In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import timeit

from sklearn import preprocessing
from sklearn.preprocessing import Imputer
import random
from fancyimpute import KNN,mice,MICE
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error,mean_absolute_error, accuracy_score
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb
from xgboost import XGBRegressor,XGBClassifier
from xgboost import plot_importance

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import metrics
from keras import backend as K
from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.cluster import KMeans
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets.samples_generator import make_blobs

from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.base import TransformerMixin, BaseEstimator
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='mean',filler='NA'):
        self.strategy = strategy
        self.fill = filler

    def fit(self, X, y=None):
        #if self.strategy in ['mean','median']:
        #    if not all(X.dtypes == np.number):
        #        raise ValueError('dtypes mismatch np.number dtype is \
         #                        required for '+ self.strategy)
        if self.strategy == 'mean':
            self.fill = X.mean()
        elif self.strategy == 'median':
            self.fill = X.median()
        elif self.strategy == 'mode':
            self.fill = X.mode().iloc[0]
        elif self.strategy == 'fill':
            if type(self.fill) is list and type(X) is pd.DataFrame:
                self.fill = dict([(cname, v) for cname,v in zip(X.columns, self.fill)])
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

1 Data Preparation

In [56]:
# the whole dataset
df = pd.read_csv("MEPS_all_feature_filter0.8.csv")
df = df.select_dtypes(exclude=['object'])
df = df.iloc[:,5:]
df.shape
Out[56]:
(35427, 1816)
In [57]:
# disease/diagnostic variables
df_health = df.loc[:,"RTHLTH31":"DSFLNV53"]
print(df_health.shape)
(35427, 284)
In [58]:
# pre_selected features, for example BMI, poverty, education, insurance edc
df_exist_f = pd.read_csv("MEPS_select_add_feature_filter0.8_nototexp15.csv")
df_exist_f.shape
Out[58]:
(35427, 55)
In [59]:
# combine health and preselected feature
# this is our feature matrix for use
df_c = pd.concat([df_exist_f, df_health],axis =1)
df_c = df_c.loc[:,~df_c.columns.duplicated()]### Feature matrix
df_c.shape
Out[59]:
(35427, 312)
In [60]:
#num/cat variables # briefly select categorical variables; notice this not a presise way
l={}
for x in df.columns:
    l[x] = (len(df[str(x)].unique()))
cat_columns = []
num_columns = []
for k,v in l.items():
    if v<=6:
        cat_columns.append(k)
    else:
        num_columns.append(k)
In [61]:
# IMPUTATE
In [ ]:
tot_list = list(df_c)
cat = [x for x in tot_list if x not in num_columns]
df_num_ord = df.filter(items = num_columns)
df_cat = df.filter(items = cat)
#################
#for categorical 
print("imputing variables...")
df_cat = df_cat.fillna(0)
#################
#for numerical
df_num_ord=pd.DataFrame(data=KNN(k=3).complete(df_num_ord), columns=df_num_ord.columns, index=df_num_ord.index)
#df_num_ord = df_num
####  one hot categorical/ concatenate to num
print("encoding categorical variables...")
enc = OneHotEncoder()
enc.fit(df_cat)  


encoded=pd.DataFrame(enc.transform(df_cat).toarray())
df_preprocessed = pd.concat([encoded,df_num_ord],axis =1)
# df_preprocessed cat->encoded num->knn
In [ ]:
df_preprocessed.shape
In [91]:
tot_list = list(df_c)
cat = [x for x in tot_list if x not in num_columns]
df_num_ord = df_c.filter(items = num_columns)
df_cat = df_c.filter(items = cat)
#################
#for categorical 
print("imputing variables...")
df_cat = df_cat.fillna(0)
#################
#for numerical
df_num_ord=pd.DataFrame(data=KNN(k=3).complete(df_num_ord), columns=df_num_ord.columns, index = df_num_ord.index)
#df_num_ord = df_num
####  one hot categorical/ concatenate to num
print("encoding categorical variables...")
enc = OneHotEncoder()
enc.fit(df_cat)  

# for mice imputation;
# notice MULTI REGRESSION IMPUTATION METHODS generally require random assumption
# it's likely no the case; 
# not sure if this is adding more noise; should be; will check by model perform
encoded=pd.DataFrame(enc.transform(df_cat).toarray())
df_preprocessed = pd.concat([encoded,df_num_ord],axis =1)
imputing variables...
Imputing row 1/35427 with 6 missing, elapsed time: 621.938
Imputing row 101/35427 with 26 missing, elapsed time: 624.270
Imputing row 201/35427 with 21 missing, elapsed time: 625.465
Imputing row 301/35427 with 23 missing, elapsed time: 626.289
Imputing row 401/35427 with 17 missing, elapsed time: 627.225
Imputing row 501/35427 with 15 missing, elapsed time: 627.834
Imputing row 601/35427 with 17 missing, elapsed time: 628.427
Imputing row 701/35427 with 23 missing, elapsed time: 629.139
Imputing row 801/35427 with 22 missing, elapsed time: 629.779
Imputing row 901/35427 with 17 missing, elapsed time: 630.467
Imputing row 1001/35427 with 17 missing, elapsed time: 631.036
Imputing row 1101/35427 with 22 missing, elapsed time: 631.601
Imputing row 1201/35427 with 14 missing, elapsed time: 632.395
Imputing row 1301/35427 with 22 missing, elapsed time: 633.569
Imputing row 1401/35427 with 11 missing, elapsed time: 634.134
Imputing row 1501/35427 with 18 missing, elapsed time: 634.620
Imputing row 1601/35427 with 16 missing, elapsed time: 635.201
Imputing row 1701/35427 with 17 missing, elapsed time: 636.099
Imputing row 1801/35427 with 14 missing, elapsed time: 636.654
Imputing row 1901/35427 with 21 missing, elapsed time: 637.421
Imputing row 2001/35427 with 22 missing, elapsed time: 638.194
Imputing row 2101/35427 with 19 missing, elapsed time: 638.705
Imputing row 2201/35427 with 23 missing, elapsed time: 639.268
Imputing row 2301/35427 with 18 missing, elapsed time: 639.756
Imputing row 2401/35427 with 15 missing, elapsed time: 640.380
Imputing row 2501/35427 with 21 missing, elapsed time: 641.056
Imputing row 2601/35427 with 22 missing, elapsed time: 641.526
Imputing row 2701/35427 with 23 missing, elapsed time: 642.291
Imputing row 2801/35427 with 14 missing, elapsed time: 642.868
Imputing row 2901/35427 with 15 missing, elapsed time: 643.349
Imputing row 3001/35427 with 12 missing, elapsed time: 644.033
Imputing row 3101/35427 with 21 missing, elapsed time: 644.486
Imputing row 3201/35427 with 14 missing, elapsed time: 645.181
Imputing row 3301/35427 with 23 missing, elapsed time: 645.725
Imputing row 3401/35427 with 17 missing, elapsed time: 646.243
Imputing row 3501/35427 with 17 missing, elapsed time: 646.697
Imputing row 3601/35427 with 22 missing, elapsed time: 647.376
Imputing row 3701/35427 with 21 missing, elapsed time: 647.997
Imputing row 3801/35427 with 17 missing, elapsed time: 648.696
Imputing row 3901/35427 with 22 missing, elapsed time: 649.523
Imputing row 4001/35427 with 21 missing, elapsed time: 650.222
Imputing row 4101/35427 with 17 missing, elapsed time: 650.865
Imputing row 4201/35427 with 20 missing, elapsed time: 651.527
Imputing row 4301/35427 with 15 missing, elapsed time: 652.277
Imputing row 4401/35427 with 13 missing, elapsed time: 652.749
Imputing row 4501/35427 with 13 missing, elapsed time: 653.715
Imputing row 4601/35427 with 25 missing, elapsed time: 654.572
Imputing row 4701/35427 with 15 missing, elapsed time: 655.164
Imputing row 4801/35427 with 23 missing, elapsed time: 656.399
Imputing row 4901/35427 with 22 missing, elapsed time: 657.100
Imputing row 5001/35427 with 22 missing, elapsed time: 657.547
Imputing row 5101/35427 with 22 missing, elapsed time: 658.244
Imputing row 5201/35427 with 16 missing, elapsed time: 658.999
Imputing row 5301/35427 with 17 missing, elapsed time: 659.563
Imputing row 5401/35427 with 15 missing, elapsed time: 660.389
Imputing row 5501/35427 with 17 missing, elapsed time: 661.098
Imputing row 5601/35427 with 23 missing, elapsed time: 661.914
Imputing row 5701/35427 with 14 missing, elapsed time: 662.927
Imputing row 5801/35427 with 17 missing, elapsed time: 663.699
Imputing row 5901/35427 with 13 missing, elapsed time: 664.558
Imputing row 6001/35427 with 16 missing, elapsed time: 665.375
Imputing row 6101/35427 with 21 missing, elapsed time: 666.128
Imputing row 6201/35427 with 10 missing, elapsed time: 666.921
Imputing row 6301/35427 with 16 missing, elapsed time: 667.472
Imputing row 6401/35427 with 13 missing, elapsed time: 668.157
Imputing row 6501/35427 with 22 missing, elapsed time: 669.108
Imputing row 6601/35427 with 22 missing, elapsed time: 670.143
Imputing row 6701/35427 with 21 missing, elapsed time: 670.981
Imputing row 6801/35427 with 14 missing, elapsed time: 671.748
Imputing row 6901/35427 with 17 missing, elapsed time: 672.443
Imputing row 7001/35427 with 12 missing, elapsed time: 672.999
Imputing row 7101/35427 with 11 missing, elapsed time: 673.586
Imputing row 7201/35427 with 25 missing, elapsed time: 674.176
Imputing row 7301/35427 with 22 missing, elapsed time: 674.738
Imputing row 7401/35427 with 14 missing, elapsed time: 675.457
Imputing row 7501/35427 with 21 missing, elapsed time: 675.903
Imputing row 7601/35427 with 15 missing, elapsed time: 676.390
Imputing row 7701/35427 with 17 missing, elapsed time: 676.942
Imputing row 7801/35427 with 14 missing, elapsed time: 677.418
Imputing row 7901/35427 with 16 missing, elapsed time: 678.009
Imputing row 8001/35427 with 23 missing, elapsed time: 678.517
Imputing row 8101/35427 with 25 missing, elapsed time: 679.127
Imputing row 8201/35427 with 20 missing, elapsed time: 679.591
Imputing row 8301/35427 with 21 missing, elapsed time: 680.199
Imputing row 8401/35427 with 16 missing, elapsed time: 680.869
Imputing row 8501/35427 with 22 missing, elapsed time: 681.448
Imputing row 8601/35427 with 15 missing, elapsed time: 682.202
Imputing row 8701/35427 with 22 missing, elapsed time: 682.804
Imputing row 8801/35427 with 23 missing, elapsed time: 683.330
Imputing row 8901/35427 with 17 missing, elapsed time: 683.893
Imputing row 9001/35427 with 16 missing, elapsed time: 684.985
Imputing row 9101/35427 with 22 missing, elapsed time: 685.759
Imputing row 9201/35427 with 16 missing, elapsed time: 686.454
Imputing row 9301/35427 with 23 missing, elapsed time: 686.995
Imputing row 9401/35427 with 21 missing, elapsed time: 687.492
Imputing row 9501/35427 with 15 missing, elapsed time: 688.136
Imputing row 9601/35427 with 24 missing, elapsed time: 688.742
Imputing row 9701/35427 with 24 missing, elapsed time: 689.364
Imputing row 9801/35427 with 13 missing, elapsed time: 689.856
Imputing row 9901/35427 with 18 missing, elapsed time: 690.510
Imputing row 10001/35427 with 17 missing, elapsed time: 691.074
Imputing row 10101/35427 with 16 missing, elapsed time: 691.681
Imputing row 10201/35427 with 13 missing, elapsed time: 692.182
Imputing row 10301/35427 with 22 missing, elapsed time: 692.735
Imputing row 10401/35427 with 21 missing, elapsed time: 693.258
Imputing row 10501/35427 with 16 missing, elapsed time: 693.751
Imputing row 10601/35427 with 23 missing, elapsed time: 694.234
Imputing row 10701/35427 with 17 missing, elapsed time: 694.952
Imputing row 10801/35427 with 10 missing, elapsed time: 695.565
Imputing row 10901/35427 with 21 missing, elapsed time: 696.076
Imputing row 11001/35427 with 25 missing, elapsed time: 696.925
Imputing row 11101/35427 with 14 missing, elapsed time: 697.593
Imputing row 11201/35427 with 12 missing, elapsed time: 698.162
Imputing row 11301/35427 with 24 missing, elapsed time: 698.908
Imputing row 11401/35427 with 22 missing, elapsed time: 699.602
Imputing row 11501/35427 with 14 missing, elapsed time: 700.221
Imputing row 11601/35427 with 16 missing, elapsed time: 700.759
Imputing row 11701/35427 with 15 missing, elapsed time: 701.210
Imputing row 11801/35427 with 14 missing, elapsed time: 701.664
Imputing row 11901/35427 with 19 missing, elapsed time: 702.119
Imputing row 12001/35427 with 17 missing, elapsed time: 702.840
Imputing row 12101/35427 with 22 missing, elapsed time: 703.417
Imputing row 12201/35427 with 15 missing, elapsed time: 704.094
Imputing row 12301/35427 with 23 missing, elapsed time: 704.791
Imputing row 12401/35427 with 22 missing, elapsed time: 705.353
Imputing row 12501/35427 with 22 missing, elapsed time: 705.880
Imputing row 12601/35427 with 16 missing, elapsed time: 706.460
Imputing row 12701/35427 with 17 missing, elapsed time: 707.043
Imputing row 12801/35427 with 22 missing, elapsed time: 707.664
Imputing row 12901/35427 with 17 missing, elapsed time: 708.268
Imputing row 13001/35427 with 15 missing, elapsed time: 708.832
Imputing row 13101/35427 with 23 missing, elapsed time: 709.430
Imputing row 13201/35427 with 23 missing, elapsed time: 710.173
Imputing row 13301/35427 with 12 missing, elapsed time: 710.883
Imputing row 13401/35427 with 10 missing, elapsed time: 711.376
Imputing row 13501/35427 with 21 missing, elapsed time: 711.943
Imputing row 13601/35427 with 22 missing, elapsed time: 712.613
Imputing row 13701/35427 with 23 missing, elapsed time: 713.284
Imputing row 13801/35427 with 17 missing, elapsed time: 713.884
Imputing row 13901/35427 with 23 missing, elapsed time: 714.354
Imputing row 14001/35427 with 23 missing, elapsed time: 714.999
Imputing row 14101/35427 with 22 missing, elapsed time: 715.638
Imputing row 14201/35427 with 15 missing, elapsed time: 716.140
Imputing row 14301/35427 with 26 missing, elapsed time: 716.621
Imputing row 14401/35427 with 22 missing, elapsed time: 717.129
Imputing row 14501/35427 with 15 missing, elapsed time: 717.735
Imputing row 14601/35427 with 10 missing, elapsed time: 718.323
Imputing row 14701/35427 with 15 missing, elapsed time: 718.856
Imputing row 14801/35427 with 15 missing, elapsed time: 719.370
Imputing row 14901/35427 with 20 missing, elapsed time: 719.844
Imputing row 15001/35427 with 15 missing, elapsed time: 720.331
Imputing row 15101/35427 with 24 missing, elapsed time: 720.926
Imputing row 15201/35427 with 14 missing, elapsed time: 721.471
Imputing row 15301/35427 with 23 missing, elapsed time: 721.995
Imputing row 15401/35427 with 17 missing, elapsed time: 722.635
Imputing row 15501/35427 with 13 missing, elapsed time: 723.134
Imputing row 15601/35427 with 21 missing, elapsed time: 723.980
Imputing row 15701/35427 with 23 missing, elapsed time: 724.624
Imputing row 15801/35427 with 16 missing, elapsed time: 725.220
Imputing row 15901/35427 with 22 missing, elapsed time: 725.824
Imputing row 16001/35427 with 22 missing, elapsed time: 726.322
Imputing row 16101/35427 with 16 missing, elapsed time: 726.909
Imputing row 16201/35427 with 17 missing, elapsed time: 727.495
Imputing row 16301/35427 with 17 missing, elapsed time: 728.151
Imputing row 16401/35427 with 22 missing, elapsed time: 728.749
Imputing row 16501/35427 with 13 missing, elapsed time: 729.239
Imputing row 16601/35427 with 14 missing, elapsed time: 729.929
Imputing row 16701/35427 with 17 missing, elapsed time: 730.474
Imputing row 16801/35427 with 18 missing, elapsed time: 731.100
Imputing row 16901/35427 with 22 missing, elapsed time: 731.771
Imputing row 17001/35427 with 17 missing, elapsed time: 732.275
Imputing row 17101/35427 with 16 missing, elapsed time: 732.798
Imputing row 17201/35427 with 13 missing, elapsed time: 733.352
Imputing row 17301/35427 with 23 missing, elapsed time: 734.023
Imputing row 17401/35427 with 21 missing, elapsed time: 734.806
Imputing row 17501/35427 with 22 missing, elapsed time: 735.606
Imputing row 17601/35427 with 22 missing, elapsed time: 736.228
Imputing row 17701/35427 with 23 missing, elapsed time: 736.741
Imputing row 17801/35427 with 25 missing, elapsed time: 737.314
Imputing row 17901/35427 with 25 missing, elapsed time: 737.894
Imputing row 18001/35427 with 14 missing, elapsed time: 738.430
Imputing row 18101/35427 with 22 missing, elapsed time: 739.100
Imputing row 18201/35427 with 25 missing, elapsed time: 739.647
Imputing row 18301/35427 with 17 missing, elapsed time: 740.155
Imputing row 18401/35427 with 24 missing, elapsed time: 740.766
Imputing row 18501/35427 with 16 missing, elapsed time: 741.231
Imputing row 18601/35427 with 17 missing, elapsed time: 741.850
Imputing row 18701/35427 with 24 missing, elapsed time: 742.350
Imputing row 18801/35427 with 17 missing, elapsed time: 743.057
Imputing row 18901/35427 with 22 missing, elapsed time: 743.688
Imputing row 19001/35427 with 13 missing, elapsed time: 744.230
Imputing row 19101/35427 with 23 missing, elapsed time: 744.837
Imputing row 19201/35427 with 24 missing, elapsed time: 745.470
Imputing row 19301/35427 with 15 missing, elapsed time: 746.153
Imputing row 19401/35427 with 26 missing, elapsed time: 746.835
Imputing row 19501/35427 with 23 missing, elapsed time: 747.334
Imputing row 19601/35427 with 19 missing, elapsed time: 748.081
Imputing row 19701/35427 with 17 missing, elapsed time: 748.705
Imputing row 19801/35427 with 16 missing, elapsed time: 749.258
Imputing row 19901/35427 with 23 missing, elapsed time: 749.769
Imputing row 20001/35427 with 22 missing, elapsed time: 750.285
Imputing row 20101/35427 with 26 missing, elapsed time: 751.060
Imputing row 20201/35427 with 24 missing, elapsed time: 751.746
Imputing row 20301/35427 with 10 missing, elapsed time: 752.307
Imputing row 20401/35427 with 14 missing, elapsed time: 752.902
Imputing row 20501/35427 with 24 missing, elapsed time: 753.557
Imputing row 20601/35427 with 15 missing, elapsed time: 754.089
Imputing row 20701/35427 with 23 missing, elapsed time: 754.666
Imputing row 20801/35427 with 16 missing, elapsed time: 755.247
Imputing row 20901/35427 with 24 missing, elapsed time: 755.765
Imputing row 21001/35427 with 15 missing, elapsed time: 756.311
Imputing row 21101/35427 with 17 missing, elapsed time: 756.807
Imputing row 21201/35427 with 14 missing, elapsed time: 757.371
Imputing row 21301/35427 with 16 missing, elapsed time: 757.998
Imputing row 21401/35427 with 18 missing, elapsed time: 758.549
Imputing row 21501/35427 with 24 missing, elapsed time: 759.079
Imputing row 21601/35427 with 16 missing, elapsed time: 759.815
Imputing row 21701/35427 with 13 missing, elapsed time: 760.363
Imputing row 21801/35427 with 18 missing, elapsed time: 760.972
Imputing row 21901/35427 with 17 missing, elapsed time: 761.546
Imputing row 22001/35427 with 13 missing, elapsed time: 762.124
Imputing row 22101/35427 with 16 missing, elapsed time: 762.663
Imputing row 22201/35427 with 16 missing, elapsed time: 763.261
Imputing row 22301/35427 with 17 missing, elapsed time: 763.809
Imputing row 22401/35427 with 16 missing, elapsed time: 764.303
Imputing row 22501/35427 with 22 missing, elapsed time: 764.847
Imputing row 22601/35427 with 12 missing, elapsed time: 765.352
Imputing row 22701/35427 with 23 missing, elapsed time: 765.882
Imputing row 22801/35427 with 13 missing, elapsed time: 766.433
Imputing row 22901/35427 with 15 missing, elapsed time: 767.209
Imputing row 23001/35427 with 23 missing, elapsed time: 767.712
Imputing row 23101/35427 with 17 missing, elapsed time: 768.236
Imputing row 23201/35427 with 24 missing, elapsed time: 768.832
Imputing row 23301/35427 with 17 missing, elapsed time: 769.329
Imputing row 23401/35427 with 16 missing, elapsed time: 769.903
Imputing row 23501/35427 with 14 missing, elapsed time: 770.457
Imputing row 23601/35427 with 22 missing, elapsed time: 771.070
Imputing row 23701/35427 with 16 missing, elapsed time: 771.676
Imputing row 23801/35427 with 25 missing, elapsed time: 772.170
Imputing row 23901/35427 with 22 missing, elapsed time: 772.732
Imputing row 24001/35427 with 18 missing, elapsed time: 773.340
Imputing row 24101/35427 with 25 missing, elapsed time: 773.994
Imputing row 24201/35427 with 22 missing, elapsed time: 774.741
Imputing row 24301/35427 with 10 missing, elapsed time: 775.282
Imputing row 24401/35427 with 18 missing, elapsed time: 775.852
Imputing row 24501/35427 with 20 missing, elapsed time: 776.435
Imputing row 24601/35427 with 25 missing, elapsed time: 777.094
Imputing row 24701/35427 with 20 missing, elapsed time: 777.873
Imputing row 24801/35427 with 15 missing, elapsed time: 778.403
Imputing row 24901/35427 with 18 missing, elapsed time: 779.037
Imputing row 25001/35427 with 18 missing, elapsed time: 779.770
Imputing row 25101/35427 with 19 missing, elapsed time: 780.302
Imputing row 25201/35427 with 22 missing, elapsed time: 780.779
Imputing row 25301/35427 with 18 missing, elapsed time: 781.241
Imputing row 25401/35427 with 23 missing, elapsed time: 781.915
Imputing row 25501/35427 with 19 missing, elapsed time: 782.625
Imputing row 25601/35427 with 19 missing, elapsed time: 783.230
Imputing row 25701/35427 with 25 missing, elapsed time: 783.768
Imputing row 25801/35427 with 16 missing, elapsed time: 784.308
Imputing row 25901/35427 with 11 missing, elapsed time: 784.837
Imputing row 26001/35427 with 23 missing, elapsed time: 785.376
Imputing row 26101/35427 with 17 missing, elapsed time: 786.015
Imputing row 26201/35427 with 15 missing, elapsed time: 786.664
Imputing row 26301/35427 with 18 missing, elapsed time: 787.200
Imputing row 26401/35427 with 23 missing, elapsed time: 787.742
Imputing row 26501/35427 with 22 missing, elapsed time: 788.266
Imputing row 26601/35427 with 17 missing, elapsed time: 788.812
Imputing row 26701/35427 with 14 missing, elapsed time: 789.333
Imputing row 26801/35427 with 23 missing, elapsed time: 790.067
Imputing row 26901/35427 with 16 missing, elapsed time: 790.590
Imputing row 27001/35427 with 23 missing, elapsed time: 791.161
Imputing row 27101/35427 with 23 missing, elapsed time: 791.885
Imputing row 27201/35427 with 13 missing, elapsed time: 793.458
Imputing row 27301/35427 with 15 missing, elapsed time: 794.261
Imputing row 27401/35427 with 23 missing, elapsed time: 795.103
Imputing row 27501/35427 with 25 missing, elapsed time: 795.877
Imputing row 27601/35427 with 16 missing, elapsed time: 796.431
Imputing row 27701/35427 with 18 missing, elapsed time: 797.179
Imputing row 27801/35427 with 16 missing, elapsed time: 797.806
Imputing row 27901/35427 with 17 missing, elapsed time: 798.582
Imputing row 28001/35427 with 18 missing, elapsed time: 799.122
Imputing row 28101/35427 with 24 missing, elapsed time: 799.900
Imputing row 28201/35427 with 23 missing, elapsed time: 800.445
Imputing row 28301/35427 with 17 missing, elapsed time: 801.007
Imputing row 28401/35427 with 13 missing, elapsed time: 801.642
Imputing row 28501/35427 with 16 missing, elapsed time: 802.273
Imputing row 28601/35427 with 14 missing, elapsed time: 802.844
Imputing row 28701/35427 with 18 missing, elapsed time: 803.419
Imputing row 28801/35427 with 25 missing, elapsed time: 803.985
Imputing row 28901/35427 with 13 missing, elapsed time: 804.533
Imputing row 29001/35427 with 17 missing, elapsed time: 805.088
Imputing row 29101/35427 with 22 missing, elapsed time: 805.658
Imputing row 29201/35427 with 15 missing, elapsed time: 806.211
Imputing row 29301/35427 with 14 missing, elapsed time: 806.838
Imputing row 29401/35427 with 17 missing, elapsed time: 807.439
Imputing row 29501/35427 with 19 missing, elapsed time: 808.008
Imputing row 29601/35427 with 12 missing, elapsed time: 808.653
Imputing row 29701/35427 with 23 missing, elapsed time: 809.261
Imputing row 29801/35427 with 24 missing, elapsed time: 809.852
Imputing row 29901/35427 with 17 missing, elapsed time: 810.453
Imputing row 30001/35427 with 22 missing, elapsed time: 811.103
Imputing row 30101/35427 with 24 missing, elapsed time: 811.851
Imputing row 30201/35427 with 17 missing, elapsed time: 813.228
Imputing row 30301/35427 with 15 missing, elapsed time: 813.865
Imputing row 30401/35427 with 23 missing, elapsed time: 814.404
Imputing row 30501/35427 with 22 missing, elapsed time: 814.968
Imputing row 30601/35427 with 23 missing, elapsed time: 815.558
Imputing row 30701/35427 with 22 missing, elapsed time: 816.110
Imputing row 30801/35427 with 24 missing, elapsed time: 816.682
Imputing row 30901/35427 with 15 missing, elapsed time: 817.208
Imputing row 31001/35427 with 23 missing, elapsed time: 817.781
Imputing row 31101/35427 with 16 missing, elapsed time: 818.378
Imputing row 31201/35427 with 13 missing, elapsed time: 818.960
Imputing row 31301/35427 with 17 missing, elapsed time: 819.627
Imputing row 31401/35427 with 18 missing, elapsed time: 820.154
Imputing row 31501/35427 with 18 missing, elapsed time: 820.709
Imputing row 31601/35427 with 24 missing, elapsed time: 821.306
Imputing row 31701/35427 with 23 missing, elapsed time: 821.869
Imputing row 31801/35427 with 18 missing, elapsed time: 822.444
Imputing row 31901/35427 with 22 missing, elapsed time: 823.014
Imputing row 32001/35427 with 23 missing, elapsed time: 823.612
Imputing row 32101/35427 with 16 missing, elapsed time: 824.179
Imputing row 32201/35427 with 17 missing, elapsed time: 824.746
Imputing row 32301/35427 with 22 missing, elapsed time: 825.312
Imputing row 32401/35427 with 18 missing, elapsed time: 825.993
Imputing row 32501/35427 with 23 missing, elapsed time: 827.072
Imputing row 32601/35427 with 22 missing, elapsed time: 827.772
Imputing row 32701/35427 with 19 missing, elapsed time: 828.678
Imputing row 32801/35427 with 24 missing, elapsed time: 829.439
Imputing row 32901/35427 with 14 missing, elapsed time: 830.011
Imputing row 33001/35427 with 17 missing, elapsed time: 830.567
Imputing row 33101/35427 with 24 missing, elapsed time: 831.222
Imputing row 33201/35427 with 13 missing, elapsed time: 831.836
Imputing row 33301/35427 with 23 missing, elapsed time: 832.531
Imputing row 33401/35427 with 17 missing, elapsed time: 833.190
Imputing row 33501/35427 with 15 missing, elapsed time: 833.801
Imputing row 33601/35427 with 18 missing, elapsed time: 834.425
Imputing row 33701/35427 with 25 missing, elapsed time: 835.016
Imputing row 33801/35427 with 17 missing, elapsed time: 835.683
Imputing row 33901/35427 with 22 missing, elapsed time: 836.293
Imputing row 34001/35427 with 17 missing, elapsed time: 837.014
Imputing row 34101/35427 with 12 missing, elapsed time: 837.980
Imputing row 34201/35427 with 12 missing, elapsed time: 838.688
Imputing row 34301/35427 with 16 missing, elapsed time: 839.255
Imputing row 34401/35427 with 26 missing, elapsed time: 839.874
Imputing row 34501/35427 with 11 missing, elapsed time: 840.456
Imputing row 34601/35427 with 16 missing, elapsed time: 840.997
Imputing row 34701/35427 with 23 missing, elapsed time: 841.563
Imputing row 34801/35427 with 23 missing, elapsed time: 842.128
Imputing row 34901/35427 with 13 missing, elapsed time: 842.680
Imputing row 35001/35427 with 23 missing, elapsed time: 843.221
Imputing row 35101/35427 with 15 missing, elapsed time: 843.815
Imputing row 35201/35427 with 21 missing, elapsed time: 844.400
Imputing row 35301/35427 with 17 missing, elapsed time: 845.718
Imputing row 35401/35427 with 18 missing, elapsed time: 846.569
encoding categorical variables...

2 EDA

Examine disease's effect on expenditure

In [ ]:
# 105
In [62]:
disease_f = ["CANCERDX", "CHDDX","STRKDX","OHRTDX","HIBPDX","EMPHDX","DIABDX",
#cancer/ coronary heart disease/ stroke/ other heart disease/ high blood pressure/ Emphysema肺气肿/Diabetes糖尿病
            "ARTHDX", "ASTHDX", "ADHDADDX","PREGNT31","IADLHP31","ADLHLP31","AIDHLP31"]
#           arthrits关节炎/asthma/ 多动症/ pregnant / independent living screener
In [59]:
# count plot for each disease
for i, x in enumerate(disease_f):
    plt.figure(i)
    print(x)
    sns.countplot(df[x].fillna(3))
    plt.show()
CANCERDX
CHDDX
STRKDX
OHRTDX
HIBPDX
EMPHDX
DIABDX
ARTHDX
ASTHDX
ADHDADDX
PREGNT31
IADLHP31
ADLHLP31
AIDHLP31
In [60]:
# plot expenditure across each disease
log_totexp = np.log(df.TOTEXP15+1)
df_explog = pd.concat([df.drop(["TOTEXP15"], axis=1), log_totexp], axis =1)
for i, cols in enumerate(disease_f):
    plt.figure(i)
    sns.violinplot(x=cols, y="TOTEXP15", data=df_explog[df_explog.TOTEXP15 >0] .fillna(3)).set_title(str(cols))
    plt.show() # this is different than 6.11plot as is acroos dis level
In [61]:
#######
In [62]:
###
df_mean = df.groupby(['CANCERDX']).mean()
print(df_mean.OBVEXP15) # office
print(df_mean.OPVEXP15) # out patient
print(df_mean.ERTEXP15) # emergency profit
print(df_mean.TOTEXP15) # total
CANCERDX
1.0    3349.815672
2.0     994.093106
Name: OBVEXP15, dtype: float64
CANCERDX
1.0    710.564114
2.0    193.467617
Name: OPVEXP15, dtype: float64
CANCERDX
1.0    333.402493
2.0    221.427702
Name: ERTEXP15, dtype: float64
CANCERDX
1.0    13391.691451
2.0     4414.898043
Name: TOTEXP15, dtype: float64
In [63]:
df_mean = df.groupby(['PREGNT31']).mean() 
print(df_mean.OBVEXP15) # office based 
print(df_mean.OPVEXP15) # out patient
print(df_mean.ERTEXP15) # emergency room expenditure
print(df_mean.TOTEXP15) # total
PREGNT31
1.0    1932.100503
2.0     867.481243
Name: OBVEXP15, dtype: float64
PREGNT31
1.0    221.087940
2.0    182.052825
Name: OPVEXP15, dtype: float64
PREGNT31
1.0    454.454774
2.0    210.892666
Name: ERTEXP15, dtype: float64
PREGNT31
1.0    10327.394472
2.0     2938.739550
Name: TOTEXP15, dtype: float64
In [64]:
df_mean = df.groupby(['STRKDX']).mean() #stroke
print(df_mean.OBVEXP15) # office
print(df_mean.OPVEXP15) # out patient
print(df_mean.ERTEXP15) # emergency profit
print(df_mean.TOTEXP15) # total
# 
print("-----------------")
#

df_mean = df.groupby(['CHDDX']).median()
print(df_mean.OBVEXP15)
STRKDX
1.0    2768.328571
2.0    1132.805287
Name: OBVEXP15, dtype: float64
STRKDX
1.0    491.140952
2.0    227.774683
Name: OPVEXP15, dtype: float64
STRKDX
1.0    555.519048
2.0    217.345181
Name: ERTEXP15, dtype: float64
STRKDX
1.0    18682.950476
2.0     4622.223536
Name: TOTEXP15, dtype: float64
-----------------
CHDDX
1.0    1079.5
2.0     193.0
Name: OBVEXP15, dtype: float64
In [65]:
df_mean = df.groupby(['DIABDX']).mean() #diabetes
print(df_mean.OBVEXP15) # office
print(df_mean.OPVEXP15) # out patient
print(df_mean.ERTEXP15) # emergency profit
print(df_mean.TOTEXP15) # total
DIABDX
1.0    2314.256675
2.0    1063.194725
Name: OBVEXP15, dtype: float64
DIABDX
1.0    405.881096
2.0    218.059983
Name: OPVEXP15, dtype: float64
DIABDX
1.0    303.432538
2.0    222.320445
Name: ERTEXP15, dtype: float64
DIABDX
1.0    12747.060164
2.0     4273.786225
Name: TOTEXP15, dtype: float64
In [124]:
# correlation matrix
df_exp = df.loc[:,"TOTEXP15":"RXEXP15"]
df_check_cor = pd.concat([df_c, df_exp.TOTEXP15],axis =1)
df_check_cor = df_check_cor.loc[:,~df_check_cor.columns.duplicated()]
df_check_cor = df_check_cor.filter(items = num_columns)
df_check_cor.rename(columns={"TOTEXP15": "Total_Expenditure", "RXTOT15": "Total Office_Based Visits",
                               "OBTOTV15": "Total Out_patients Visits", "TRBLE42": "Total Prescribed Medcine",
                               "HAVFUN42": "Age", "HOMEBH42": "Attitude towards Insurance",
                               "ADAPPT42": "Family Income Index", "ERTOT15": "TOTAL Emergency Room Visits",
                               "OPTOTV15": "BMI", "KIDPRO42": "Perceived Health Status",
                               "OBOTHV15": "Working Hours", "ASTHAGED": "Cancer Diagnosis",
                                "AGELAST": "Office Based Non-Physician Visits", "AGE53X": "Highest Education"}, inplace=True)
corr = df_check_cor.corr()
corr
Out[124]:
FAMSZEYR Highest Education Office Based Non-Physician Visits MARRY31X EDUYRDG EDRECODE HIBPAGED CHDAGED ANGIAGED MIAGED ... TYPEPE42 HOUR31 POVLEV15 VETSP15X Total_Expenditure Total Out_patients Visits Working Hours BMI TOTAL Emergency Room Visits Total Office_Based Visits
FAMSZEYR 1.000000 -0.454152 -0.454957 0.159860 -0.144932 -0.341420 -0.198482 -0.229130 -0.240105 -0.199657 ... 0.009986 -0.033157 -0.174586 -0.044759 -0.146677 -0.172734 -0.108705 -0.082637 -0.090447 -0.262473
Highest Education -0.454152 1.000000 0.999918 -0.623679 0.066208 0.449939 0.741372 0.751920 0.713442 0.751632 ... -0.052270 0.098927 0.194513 0.053212 0.214632 0.223858 0.125746 0.116574 0.074616 0.411737
Office Based Non-Physician Visits -0.454957 0.999918 1.000000 -0.623212 0.064993 0.445757 0.739677 0.742089 0.714402 0.749247 ... -0.052272 0.098241 0.190350 0.053477 0.216429 0.221383 0.124026 0.115471 0.077651 0.409697
MARRY31X 0.159860 -0.623679 -0.623212 1.000000 -0.155873 -0.409117 -0.194508 -0.137465 -0.189456 -0.120836 ... 0.071061 -0.106604 -0.242948 -0.046511 -0.111988 -0.121706 -0.071826 -0.064957 -0.000652 -0.187272
EDUYRDG -0.144932 0.066208 0.064993 -0.155873 1.000000 0.864215 -0.064235 0.005532 0.031582 0.043332 ... -0.008772 0.107471 0.279866 0.019828 0.032535 0.074246 0.054835 0.026322 0.011240 -0.002113
EDRECODE -0.341420 0.449939 0.445757 -0.409117 0.864215 1.000000 -0.082957 -0.019508 0.013637 -0.040828 ... -0.044292 0.084335 0.307090 0.050968 0.086726 0.112074 0.082026 0.061724 0.028597 0.099781
HIBPAGED -0.198482 0.741372 0.739677 -0.194508 -0.064235 -0.082957 1.000000 0.557546 0.556852 0.571858 ... 0.033863 -0.027639 0.068161 -0.015252 0.035594 0.059945 0.024417 0.015898 -0.027557 0.054219
CHDAGED -0.229130 0.751920 0.742089 -0.137465 0.005532 -0.019508 0.557546 1.000000 0.807207 0.779497 ... 0.075621 -0.088602 0.050217 -0.018251 0.025596 0.058604 0.026447 -0.029892 0.001199 0.062756
ANGIAGED -0.240105 0.713442 0.714402 -0.189456 0.031582 0.013637 0.556852 0.807207 1.000000 0.733543 ... -0.011225 -0.027165 0.070852 0.002457 0.015473 0.131121 0.080883 -0.012013 -0.001819 0.055101
MIAGED -0.199657 0.751632 0.749247 -0.120836 0.043332 -0.040828 0.571858 0.779497 0.733543 1.000000 ... -0.027960 -0.118196 0.038875 -0.002172 0.021324 0.029281 -0.020974 -0.021000 -0.065926 0.009062
OHRTAGED -0.278819 0.749838 0.749450 -0.193842 -0.124427 -0.109749 0.478914 0.776930 0.615703 0.706008 ... 0.019270 -0.046883 -0.021253 0.027139 0.162739 0.145095 0.059437 0.090972 0.077209 0.260999
STRKAGED -0.253565 0.834896 0.832093 -0.194569 -0.033855 -0.050098 0.609027 0.658207 0.585902 0.712243 ... 0.021245 -0.054945 0.052496 -0.006627 0.035621 0.031458 -0.010214 -0.013087 0.018428 0.085749
EMPHAGED -0.072778 0.728459 0.726218 -0.217510 0.098255 0.012797 0.520847 0.436891 0.358389 0.670061 ... 0.042023 -0.019096 0.155397 0.041374 0.036978 0.000615 -0.058004 0.032477 -0.076638 0.003144
CHOLAGED -0.254395 0.829270 0.829242 -0.161891 -0.109494 -0.095219 0.706496 0.631409 0.589210 0.660266 ... 0.000175 -0.077363 -0.022952 -0.022393 0.077723 0.095935 0.038542 0.027601 0.012210 0.148952
DIABAGED -0.218713 0.764156 0.763655 -0.173434 0.020584 -0.049483 0.650065 0.551972 0.525956 0.552089 ... -0.040324 -0.021950 0.088597 -0.003229 -0.002807 0.000217 -0.017821 -0.011003 -0.067389 0.011685
ARTHAGED -0.230822 0.772485 0.771115 -0.192301 -0.047865 -0.086521 0.561774 0.483887 0.463736 0.531187 ... -0.009091 -0.081422 0.085738 -0.026685 0.052489 0.054056 0.003802 0.015999 -0.040285 0.062703
Cancer Diagnosis -0.318757 0.723172 0.722669 -0.440331 0.089206 0.265503 0.422302 0.204455 0.158323 0.307554 ... -0.006474 -0.051751 0.092891 0.080908 0.222402 0.202608 0.081415 0.134104 0.058577 0.395350
ADHDAGED -0.094890 0.458596 0.458353 -0.322561 -0.146140 0.181903 0.574655 NaN NaN NaN ... -0.022525 0.004099 0.104256 -0.002373 -0.018932 -0.017190 -0.023193 -0.014288 0.030533 -0.056873
MOMPRO42 -0.047440 0.027201 0.028149 -0.024937 -0.020200 0.036294 -1.000000 NaN NaN NaN ... -0.039033 0.094955 -0.017856 0.002537 0.046814 0.015056 0.015132 -0.002129 0.006612 0.061569
DADPRO42 -0.117072 0.062278 0.061434 -0.031780 -0.025621 0.024992 -1.000000 NaN NaN NaN ... -0.022291 0.069919 -0.093855 0.040436 0.046585 0.062092 0.046372 0.021096 0.041433 0.087550
UNHAP42 -0.021210 0.001249 0.000018 0.011089 -0.013117 -0.005387 -1.000000 NaN NaN NaN ... 0.106060 -0.002846 -0.029870 0.001947 0.129348 0.027797 0.021456 0.005773 0.008615 0.114242
SCHLBH42 -0.032918 -0.053349 -0.053530 -0.016060 0.132984 0.046438 -1.000000 NaN NaN NaN ... -0.026145 0.150091 -0.015257 0.001893 0.133342 0.014926 0.003458 0.004710 0.002417 0.118118
Age -0.022369 0.002375 0.000435 0.007918 -0.018809 -0.004634 -1.000000 NaN NaN NaN ... -0.008836 0.057040 -0.015433 -0.000876 0.350828 0.015701 0.006150 0.012725 -0.005192 0.212670
ADUPRO42 -0.025109 0.007312 0.005241 0.008102 -0.016835 0.000785 -1.000000 NaN NaN NaN ... -0.007024 -0.077653 -0.018721 0.002218 0.207228 0.021461 0.012809 0.014818 0.001773 0.170784
NERVAF42 -0.025169 -0.029600 -0.031263 0.022686 -0.009441 -0.011566 -1.000000 NaN NaN NaN ... -0.009120 -0.014030 -0.028127 0.002054 0.116611 0.017206 0.012236 0.009210 -0.003105 0.110141
SIBPRO42 -0.290403 -0.055602 -0.055567 0.014029 0.044684 -0.019484 -1.000000 NaN NaN NaN ... 0.030808 -0.202759 0.121299 -0.002536 0.056547 0.049392 0.040258 0.015584 0.039832 0.057192
Perceived Health Status -0.031736 -0.011035 -0.012583 0.002432 0.015918 -0.003054 -1.000000 NaN NaN NaN ... -0.016637 0.050255 -0.017025 0.001776 0.236084 0.049198 0.029394 0.013177 0.029569 0.206532
SPRPRO42 -0.017573 -0.028367 -0.027746 0.000494 0.071301 -0.002354 -1.000000 NaN NaN NaN ... -0.018487 -0.009711 -0.037231 -0.001691 0.161596 0.019432 0.009675 0.072824 0.018524 0.100864
SCHPRO42 -0.042196 -0.081124 -0.081037 -0.013479 0.224562 0.047240 -1.000000 NaN NaN NaN ... -0.004723 0.144858 -0.014406 -0.000652 0.182936 0.024411 0.008309 0.117591 0.019383 0.107346
Attitude towards Insurance -0.028393 -0.008923 -0.009955 0.005411 -0.006563 0.000250 -1.000000 NaN NaN NaN ... 0.016125 0.066316 -0.025789 0.003657 0.338262 0.040611 0.025646 0.024406 0.032655 0.230927
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
CHECK53 0.066022 -0.215665 -0.215485 0.108904 -0.111785 -0.086649 -0.152694 -0.080758 -0.163420 -0.131820 ... -0.016098 0.011687 -0.098347 -0.033514 -0.118724 -0.160869 -0.090324 -0.067202 -0.070652 -0.213355
FLUSHT53 0.099415 -0.240343 -0.240467 0.127931 -0.110910 -0.071113 -0.151745 -0.158422 -0.116893 -0.192692 ... -0.017826 0.008017 -0.097757 -0.036250 -0.141169 -0.167897 -0.095654 -0.073370 -0.064680 -0.226244
PSA53 0.218942 -0.435040 -0.434692 0.090140 -0.137642 -0.122350 -0.204208 -0.185083 -0.091678 -0.240364 ... -0.013606 0.008619 -0.147678 -0.058775 -0.149071 -0.200316 -0.115098 -0.073621 -0.067788 -0.254617
PAPSMR53 -0.065782 0.168224 0.168072 0.038567 -0.189589 -0.154997 0.260530 0.244878 0.344685 0.235081 ... 0.017880 -0.104245 -0.067145 -0.002907 0.012362 -0.022463 -0.014272 0.000960 -0.011419 0.073312
BRSTEX53 0.057057 -0.108881 -0.108995 0.120715 -0.201533 -0.170158 0.069343 0.087383 0.103892 0.163897 ... 0.026813 -0.109125 -0.140523 -0.001859 -0.045626 -0.114687 -0.074302 -0.065905 -0.022876 -0.073756
MAMOGR53 0.189648 -0.400497 -0.400589 0.126392 -0.060825 -0.061331 -0.118718 0.008431 -0.096914 0.077185 ... 0.023797 -0.023191 -0.174581 -0.009107 -0.075052 -0.137488 -0.070974 -0.110913 0.002291 -0.181314
BSTST53 0.104625 -0.251345 -0.251427 0.017941 0.020268 0.013180 -0.111219 -0.071786 -0.100853 -0.038002 ... -0.002424 0.059281 0.007953 -0.037329 -0.088379 -0.135371 -0.076999 -0.035934 -0.045584 -0.158631
CLNTST53 0.198616 -0.354884 -0.354904 0.073403 -0.091093 -0.088602 -0.126533 -0.082424 -0.037876 -0.095002 ... -0.017157 0.037037 -0.112771 -0.031145 -0.150298 -0.207668 -0.119809 -0.095400 -0.081452 -0.225649
SGMTST53 0.053694 -0.117732 -0.117830 0.004325 -0.016961 0.005628 -0.038328 -0.059639 -0.023068 -0.063151 ... -0.007880 0.029807 -0.001378 -0.009459 -0.059726 -0.072616 -0.028662 -0.039239 -0.034761 -0.076752
BMINDX53 -0.010579 0.065761 0.065878 -0.014305 -0.048027 -0.010014 -0.236086 -0.136644 -0.219522 -0.206369 ... -0.017850 0.055177 -0.081427 0.027939 0.038638 0.055171 0.027904 0.033840 0.059506 0.173436
SEATBE53 -0.039958 -0.010536 -0.010409 0.052390 -0.064335 -0.026245 -0.051025 -0.004855 -0.047991 -0.048385 ... 0.014442 0.032532 -0.061026 0.002284 0.002267 -0.011638 -0.008506 0.006346 0.034674 0.041628
ADPRXY42 -0.039324 0.083654 0.083662 0.004484 -0.083582 -0.088160 0.079659 0.063585 0.060987 0.039155 ... -0.002873 -0.040594 -0.057562 0.002888 0.043623 0.014036 -0.008037 0.002640 0.017361 0.085412
Family Income Index -0.176703 0.320813 0.320674 -0.114367 0.118411 0.077291 0.145423 0.134752 0.161568 0.175505 ... 0.064984 -0.041106 0.096843 0.061510 0.306254 0.423325 0.254591 0.174131 0.180183 0.428739
ADHECR42 -0.026957 0.101569 0.101507 -0.058276 0.059159 0.033674 0.096331 0.135342 0.206333 0.133984 ... -0.019290 0.004799 0.092994 -0.012719 -0.005005 0.010577 0.003458 0.014555 -0.062093 0.002608
PCS42 0.159484 -0.426856 -0.427102 0.088350 0.143644 0.115521 -0.150493 -0.160174 -0.127464 -0.162515 ... -0.062278 0.040393 0.146057 -0.070073 -0.298220 -0.296533 -0.160009 -0.143062 -0.234147 -0.503506
MCS42 0.053875 -0.025315 -0.025449 -0.058024 0.076562 0.073014 0.102475 0.020551 0.100977 0.061088 ... -0.011404 0.027997 0.118817 -0.034138 -0.125736 -0.133261 -0.086709 -0.049145 -0.164020 -0.235371
K6SUM42 -0.094762 0.050486 0.050651 0.060089 -0.088153 -0.067610 -0.083461 -0.050230 -0.090055 -0.068618 ... 0.018179 -0.046591 -0.129781 0.037530 0.153935 0.167527 0.106588 0.066467 0.186360 0.295966
PHQ242 -0.078070 0.071510 0.071755 0.054453 -0.110530 -0.088016 -0.058392 -0.030641 -0.069410 -0.032033 ... 0.027894 -0.040736 -0.141541 0.038361 0.148559 0.149465 0.093833 0.065726 0.170889 0.281193
ADCMPM42 -0.002170 -0.023418 -0.024716 0.003630 0.032929 0.037287 -0.038630 -0.022539 0.010395 0.003579 ... 0.019502 0.016946 0.046697 0.016807 -0.006289 0.001636 0.009354 -0.021594 -0.002621 -0.010348
DSA1C53 -0.025324 -0.025703 -0.025684 0.063428 -0.074038 -0.032894 -0.024201 0.022488 -0.164173 0.043494 ... 0.034557 0.024568 -0.049270 -0.030149 0.004982 -0.022266 -0.024778 0.002651 0.019654 -0.018593
TYPEPE42 0.009986 -0.052270 -0.052272 0.071061 -0.008772 -0.044292 0.033863 0.075621 -0.011225 -0.027960 ... 1.000000 0.012286 -0.024141 0.006338 0.021714 0.027391 0.041396 0.006106 0.011499 0.015531
HOUR31 -0.033157 0.098927 0.098241 -0.106604 0.107471 0.084335 -0.027639 -0.088602 -0.027165 -0.118196 ... 0.012286 1.000000 0.169655 -0.005278 -0.014959 -0.063163 -0.041758 -0.032291 -0.047379 -0.031594
POVLEV15 -0.174586 0.194513 0.190350 -0.242948 0.279866 0.307090 0.068161 0.050217 0.070852 0.038875 ... -0.024141 0.169655 1.000000 0.048096 0.039319 0.086961 0.074085 0.017258 -0.067938 -0.011781
VETSP15X -0.044759 0.053212 0.053477 -0.046511 0.019828 0.050968 -0.015252 -0.018251 0.002457 -0.002172 ... 0.006338 -0.005278 0.048096 1.000000 0.041410 0.048163 0.026383 0.037458 0.019540 0.073902
Total_Expenditure -0.146677 0.214632 0.216429 -0.111988 0.032535 0.086726 0.035594 0.025596 0.015473 0.021324 ... 0.021714 -0.014959 0.039319 0.041410 1.000000 0.374827 0.235925 0.260768 0.276263 0.413218
Total Out_patients Visits -0.172734 0.223858 0.221383 -0.121706 0.074246 0.112074 0.059945 0.058604 0.131121 0.029281 ... 0.027391 -0.063163 0.086961 0.048163 0.374827 1.000000 0.857173 0.147737 0.162377 0.377547
Working Hours -0.108705 0.125746 0.124026 -0.071826 0.054835 0.082026 0.024417 0.026447 0.080883 -0.020974 ... 0.041396 -0.041758 0.074085 0.026383 0.235925 0.857173 1.000000 0.094120 0.089473 0.203641
BMI -0.082637 0.116574 0.115471 -0.064957 0.026322 0.061724 0.015898 -0.029892 -0.012013 -0.021000 ... 0.006106 -0.032291 0.017258 0.037458 0.260768 0.147737 0.094120 1.000000 0.095720 0.182448
TOTAL Emergency Room Visits -0.090447 0.074616 0.077651 -0.000652 0.011240 0.028597 -0.027557 0.001199 -0.001819 -0.065926 ... 0.011499 -0.047379 -0.067938 0.019540 0.276263 0.162377 0.089473 0.095720 1.000000 0.237572
Total Office_Based Visits -0.262473 0.411737 0.409697 -0.187272 -0.002113 0.099781 0.054219 0.062756 0.055101 0.009062 ... 0.015531 -0.031594 -0.011781 0.073902 0.413218 0.377547 0.203641 0.182448 0.237572 1.000000

66 rows × 66 columns

In [125]:
corr.Total_Expenditure.sort_values(ascending=False)
Out[125]:
Total_Expenditure                    1.000000
Total Office_Based Visits            0.413218
Total Out_patients Visits            0.374827
Total Prescribed Medcine             0.363849
Age                                  0.350828
Attitude towards Insurance           0.338262
Family Income Index                  0.306254
TOTAL Emergency Room Visits          0.276263
BMI                                  0.260768
Perceived Health Status              0.236084
Working Hours                        0.235925
Cancer Diagnosis                     0.222402
Office Based Non-Physician Visits    0.216429
Highest Education                    0.214632
CHAPPT42                             0.209309
ADUPRO42                             0.207228
SCHPRO42                             0.182936
OHRTAGED                             0.162739
SPRPRO42                             0.161596
K6SUM42                              0.153935
PHQ242                               0.148559
SCHLBH42                             0.133342
UNHAP42                              0.129348
NERVAF42                             0.116611
EDRECODE                             0.086726
CHOLAGED                             0.077723
SIBPRO42                             0.056547
ARTHAGED                             0.052489
MOMPRO42                             0.046814
DADPRO42                             0.046585
                                       ...   
HIBPAGED                             0.035594
EDUYRDG                              0.032535
CHDAGED                              0.025596
TYPEPE42                             0.021714
MIAGED                               0.021324
ANGIAGED                             0.015473
PAPSMR53                             0.012362
CHBMIX42                             0.005131
DSA1C53                              0.004982
SEATBE53                             0.002267
DIABAGED                            -0.002807
ADHECR42                            -0.005005
ADCMPM42                            -0.006289
HOUR31                              -0.014959
ADHDAGED                            -0.018932
CHHECR42                            -0.024422
BRSTEX53                            -0.045626
SGMTST53                            -0.059726
MAMOGR53                            -0.075052
BSTST53                             -0.088379
MARRY31X                            -0.111988
CHECK53                             -0.118724
BPCHEK53                            -0.119744
MCS42                               -0.125736
CHOLCK53                            -0.139350
FLUSHT53                            -0.141169
FAMSZEYR                            -0.146677
PSA53                               -0.149071
CLNTST53                            -0.150298
PCS42                               -0.298220
Name: Total_Expenditure, Length: 66, dtype: float64
In [126]:
corr.Total_Expenditure.sort_values(ascending=False)
Out[126]:
Total_Expenditure                    1.000000
Total Office_Based Visits            0.413218
Total Out_patients Visits            0.374827
Total Prescribed Medcine             0.363849
Age                                  0.350828
Attitude towards Insurance           0.338262
Family Income Index                  0.306254
TOTAL Emergency Room Visits          0.276263
BMI                                  0.260768
Perceived Health Status              0.236084
Working Hours                        0.235925
Cancer Diagnosis                     0.222402
Office Based Non-Physician Visits    0.216429
Highest Education                    0.214632
CHAPPT42                             0.209309
ADUPRO42                             0.207228
SCHPRO42                             0.182936
OHRTAGED                             0.162739
SPRPRO42                             0.161596
K6SUM42                              0.153935
PHQ242                               0.148559
SCHLBH42                             0.133342
UNHAP42                              0.129348
NERVAF42                             0.116611
EDRECODE                             0.086726
CHOLAGED                             0.077723
SIBPRO42                             0.056547
ARTHAGED                             0.052489
MOMPRO42                             0.046814
DADPRO42                             0.046585
                                       ...   
HIBPAGED                             0.035594
EDUYRDG                              0.032535
CHDAGED                              0.025596
TYPEPE42                             0.021714
MIAGED                               0.021324
ANGIAGED                             0.015473
PAPSMR53                             0.012362
CHBMIX42                             0.005131
DSA1C53                              0.004982
SEATBE53                             0.002267
DIABAGED                            -0.002807
ADHECR42                            -0.005005
ADCMPM42                            -0.006289
HOUR31                              -0.014959
ADHDAGED                            -0.018932
CHHECR42                            -0.024422
BRSTEX53                            -0.045626
SGMTST53                            -0.059726
MAMOGR53                            -0.075052
BSTST53                             -0.088379
MARRY31X                            -0.111988
CHECK53                             -0.118724
BPCHEK53                            -0.119744
MCS42                               -0.125736
CHOLCK53                            -0.139350
FLUSHT53                            -0.141169
FAMSZEYR                            -0.146677
PSA53                               -0.149071
CLNTST53                            -0.150298
PCS42                               -0.298220
Name: Total_Expenditure, Length: 66, dtype: float64
In [127]:
#plot corr matrix
corr = corr.corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = 1

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(8, 6))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=0.8,vmin=-0.2, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .6})
Out[127]:
<matplotlib.axes._subplots.AxesSubplot at 0x25d80dbd080>
In [86]:
df_base = pd.concat([df_c, df["TOTEXP15"]],axis =1)
df_base = df_base.loc[:,~df_base.columns.duplicated()]
df_base = df_base.loc[(df_base["TOTEXP15" ])]   
#df_xgb_base = df_base
df_xgb_base = df_base.fillna(0)
df_xgb_base  = df_xgb_base.replace([np.inf, -np.inf], 0)
x = df_xgb_base.drop(['TOTEXP15'], axis=1)
y = np.log(df_xgb_base["TOTEXP15"]+1)
 
#x = preprocessing.scale(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state = 6)
#model fit  
xgb_model = XGBRegressor()
xgb_m = GridSearchCV(xgb_model,
                   {'max_depth': [3,4,5],
                    'n_estimators': [50,75,100],
                    "learning_rate": [0.1,0.2,0.3]
                   }, verbose=1, n_jobs=-1, cv=3)

xgb_m = xgb_m.fit(X=X_train,y=y_train)
train_mae = mean_absolute_error(y_train, xgb_m.predict(X_train))
test_mae = mean_absolute_error(y_test, xgb_m.predict(X_test))

    #plt1
pred = xgb_m.predict(X_test)
true = y_test
f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(true, pred )
ax.set_xlim([2, 12])
ax.set_ylim([2, 12])
plt.title("Predict vs True")
plt.xlabel("true_value")
plt.ylabel("predicted_value")
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".2")
plt.show()
    
    #plt2
params = xgb_m.best_params_
model  = XGBRegressor(params["max_depth"], params["learning_rate"], params["n_estimators"])
model.fit(X_train, y_train)
plt.figure()
xgb.plot_importance(model, max_num_features=15)
plt.show()

print (xgb_m.best_params_)
ret={}
ret["test R^2"] = xgb_m.best_score_  
ret["training R^2"] =  xgb_m.score(X=X_train,y=y_train) 
ret["test mae"] = test_mae  
ret["training mae"] =  train_mae   
ret["observations_count"] =  len(y)
print(ret)
'''
for x in ret:
    print(x)
    print("-----")
'''
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  9.8min finished
<matplotlib.figure.Figure at 0x18f00e52dd8>
{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 100}
{'test R^2': 0.9482604156671363, 'training R^2': 0.9647949400124298, 'test mae': 0.44665195231074023, 'training mae': 0.39143639305303285, 'observations_count': 35427}
test R^2
-----
training R^2
-----
test mae
-----
training mae
-----
observations_count
-----
In [ ]:
## notice self imputed data performs worse than xgb imputed num
In [93]:
df_base = pd.concat([df_preprocessed, df["TOTEXP15"]],axis =1)
df_base = df_base.loc[:,~df_base.columns.duplicated()]
df_base = df_base.loc[(df_base["TOTEXP15" ])]   
df_xgb_base = df_base.fillna(0)
df_xgb_base.replace([np.inf, -np.inf], 0)
y = np.log(df_xgb_base["TOTEXP15"]+1)
x = df_xgb_base.drop(['TOTEXP15'], axis=1)



X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state = 6)
    #model fit  
xgb_model = XGBRegressor()
xgb_m = GridSearchCV(xgb_model,
                   {'max_depth': [3,4,5],
                    'n_estimators': [50,75],
                    "learning_rate": [0.1,0.2,0.3]
                   }, verbose=1, n_jobs=-1, cv=3)

xgb_m = xgb_m.fit(X=X_train,y=y_train)
train_mae = mean_absolute_error(y_train, xgb_m.predict(X_train))
test_mae = mean_absolute_error(y_test, xgb_m.predict(X_test))

    #plt1
pred = xgb_m.predict(X_test)
true = y_test
f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(true, pred )
ax.set_xlim([2, 12])
ax.set_ylim([2, 12])
plt.title("Predict vs True")
plt.xlabel("true_value")
plt.ylabel("predicted_value")
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".2")
plt.show()
    
    #plt2
params = xgb_m.best_params_
model  = XGBRegressor(params["max_depth"], params["learning_rate"], params["n_estimators"])
model.fit(X_train, y_train)
plt.figure()
xgb.plot_importance(model, max_num_features=15)
plt.show()

print (xgb_m.best_params_)
ret={}
ret["test R^2"] = xgb_m.best_score_  
ret["training R^2"] =  xgb_m.score(X=X_train,y=y_train) 
ret["test mae"] = test_mae  
ret["training mae"] =  train_mae   
ret["observations_count"] =  len(y)
print(ret)
#####
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 20.4min finished
<matplotlib.figure.Figure at 0x18f2d131dd8>
{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 75}
{'test R^2': 0.9467046386591372, 'training R^2': 0.9527039843932477, 'test mae': 0.5088554615757112, 'training mae': 0.47410870233848884, 'observations_count': 35427}

3. Model fit, diagonosed patient to specific expenditure

In [8]:
# fit model/plot
def DvsE_out_plot(d,e,df_input,df_all):
    '''
    disease vs expenditure;
    input:
        df_input is feature df which contains feacture of interests
        df_all is the whole df for extracting disease and expenditure;
    return:
        (plot feature importance to explore)
        train/ test R^2 and MAE, and sample counts
    '''
    #input
    df_c = df_input
    df=df_all
    df_c = pd.concat([df_c, df[e]],axis =1)
    df_c = df_c.loc[:,~df_c.columns.duplicated()]
    df_cheart = df_c.loc[(df_c[d] == 1) & (df_c[e] > 0) ]   
    
    #preprocess
    y = np.log(df_cheart[e]+1)        
    if len(y) < 10:
        ret = "observation count smaller than 10"
        print(d,"obervation count smaller than 10")
        return ret
    x = df_cheart.drop([e], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 6)
    #model fit  
    xgb_model = XGBRegressor()
    xgb_m = GridSearchCV(xgb_model,
                   {'max_depth': [2,3,5],
                    'n_estimators': [25,45,65],
                    "learning_rate": [0.1,0.2,0.3]
                   }, verbose=1, n_jobs=-1, cv=3)

    xgb_m = xgb_m.fit(X=X_train,y=y_train)
    train_mae = mean_absolute_error(y_train, xgb_m.predict(X_train))
    test_mae = mean_absolute_error(y_test, xgb_m.predict(X_test))

    #plt1
    pred = xgb_m.predict(X_test)
    true = y_test
    f, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(true, pred )
    ax.set_xlim([0, 12])
    ax.set_ylim([0, 12])
    plt.title("Predict vs True")
    plt.xlabel("true_value")
    plt.ylabel("predicted_value")
    ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
    plt.show()
    
    #plt2
    params = xgb_m.best_params_
    model  = XGBRegressor(params["max_depth"], params["learning_rate"], params["n_estimators"])
    model.fit(X_train, y_train)
    plt.figure()
    xgb.plot_importance(model, max_num_features=15)
    plt.show()
    #return
    print (xgb_m.best_params_)
    ret={}
    ret["test R^2"] = xgb_m.best_score_  
    #ret["training R^2"] =  xgb_m.score(X=X_train,y=y_train) 
    #ret["test mae"] = test_mae  
    #ret["training mae"] =  train_mae   
    ret["observations_count"] =  len(y)

    return ret
In [8]:
df_input = df_c
df_all = df

coronary heart disease

In [131]:
# explore coronary heart disease vs TOTEXP
DvsE_out_plot('CHDDX','TOTEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   28.3s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x25a138c9080>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 50}
Out[131]:
{'observations_count': 1256,
 'test R^2': 0.5394685907729895,
 'test mae': 0.8961343399092596,
 'training R^2': 0.7048031940884515,
 'training mae': 0.6596149146217025}
In [166]:
DvsE_out('CHDDX','TOTEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   30.7s finished
Out[166]:
{'observations_count': 1290,
 'test R^2': 0.6983225429845477,
 'test mae': 0.9767107473442648,
 'training R^2': 0.8802261728945496,
 'training mae': 0.563255302464367}
In [132]:
# explore coronary heart disease vs office based exp
DvsE_out_plot('CHDDX','OBVEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   26.6s finished
<matplotlib.figure.Figure at 0x25a14651780>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50}
Out[132]:
{'observations_count': 1171,
 'test R^2': 0.723673033334872,
 'test mae': 0.5535039942438549,
 'training R^2': 0.7710698373609801,
 'training mae': 0.490207104241209}
In [133]:
# explore coronary heart disease vs outpatient based exp
DvsE_out_plot('CHDDX','OPVEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.8s finished
<matplotlib.figure.Figure at 0x25a060d1550>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Out[133]:
{'observations_count': 289,
 'test R^2': -0.02387128174215062,
 'test mae': 1.3885496142580205,
 'training R^2': 0.7780351097943198,
 'training mae': 0.6371446198493126}
In [134]:
# explore coronary heart disease vs outpatient based exp
DvsE_out_plot('CHDDX','RXEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   26.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x25a060f29b0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 75}
Out[134]:
{'observations_count': 1209,
 'test R^2': 0.6230316135229346,
 'test mae': 0.861622921307236,
 'training R^2': 0.7308444350805714,
 'training mae': 0.7144983086918879}

preganant

In [125]:
# explore coronary heart disease vs outpatient based exp
DvsE_out_plot('PREGNT31','TOTEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x25a13850ac8>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Out[125]:
{'observations_count': 380,
 'test R^2': 0.145746931206345,
 'test mae': 0.9674326854001325,
 'training R^2': 0.7142713969507406,
 'training mae': 0.5769362234143682}
In [126]:
# explore coronary heart disease vs outpatient based exp
DvsE_out_plot('PREGNT31','OBVEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.5s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x25a060f2978>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Out[126]:
{'observations_count': 339,
 'test R^2': 0.4960027080003214,
 'test mae': 0.6744314195696273,
 'training R^2': 0.8515594530367072,
 'training mae': 0.38958074127162934}
In [127]:
# explore coronary heart disease vs outpatient based exp
DvsE_out_plot('PREGNT31','OPVEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   16.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x25a1374d198>
{'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 25}
Out[127]:
{'observations_count': 50,
 'test R^2': -0.5488691283949321,
 'test mae': 1.0938154801830318,
 'training R^2': 0.9965480757182692,
 'training mae': 0.058604034320640305}
In [128]:
DvsE_out_plot('PREGNT31','RXEXP15',df_input,df_all)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   18.5s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x25a14fec4e0>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 80}
Out[128]:
{'observations_count': 275,
 'test R^2': 0.40499188645455075,
 'test mae': 0.9586731793357812,
 'training R^2': 0.933058844977047,
 'training mae': 0.3359852890838969}

classifier

In [11]:
def DvsE_cls(d,e,df_input,df_all):
    #input
    df_c = df_input
    df=df_all
    
    df_c = pd.concat([df_c, df[e]],axis =1)
    df_c = df_c.loc[:,~df_c.columns.duplicated()]
    df_cheart = df_c.loc[(df_c[d] == 1)]
    
    #preprocess
    y = (df_cheart[e])
    #print(y)
    y = pd.cut(y,3, labels=["low","medium","high"])     ## low value vs high value      
    #print(y)
    x = df_cheart.drop([e], axis=1)
    #x = preprocessing.scale(x)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 6)
    #print(y_train)
    #print(y_test)
    xgb_model = XGBClassifier()
    xgb_m = GridSearchCV(xgb_model,
                   {'max_depth': [3,4,5],
                    'n_estimators': [25,50,75],
                    "learning_rate": [0.1,0.2,0.3]
                   }, verbose=1, n_jobs=-1, cv=3)
    xgb_m = xgb_m.fit(X=X_train,y=y_train)
    test_acc = accuracy_score(y_test, xgb_m.predict(X_test))
    
    ret_dic = {}
    ret_dic["train_acc"] = xgb_m.best_score_
    ret_dic["test_acc"] = test_acc
    ret_dic["parameter"] = xgb_m.best_params_
    print(len(y))
    return ret_dic
In [162]:
print(DvsE_cls('CANCERDX','TOTEXP15',df_c,df))
print(DvsE_cls('CHDDX','OBVEXP15',df_c,df))
Fitting 3 folds for each of 27 candidates, totalling 81 fits
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:597: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.
  % (min_groups, self.n_splits)), Warning)
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   25.8s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
2246
{'train_acc': 0.9988864142538976, 'test_acc': 1.0, 'parameter': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 25}}
Fitting 3 folds for each of 27 candidates, totalling 81 fits
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:597: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.
  % (min_groups, self.n_splits)), Warning)
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   23.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
1290
{'train_acc': 0.999031007751938, 'test_acc': 1.0, 'parameter': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 25}}

output disease vs expenditure model performance and summary stats

In [9]:
disease_f = ["CANCERDX", "CHDDX","STRKDX","OHRTDX","HIBPDX","EMPHDX","DIABDX",
#cancer/ coronary heart disease/ stroke/ other heart disease/ high blood pressure/ Emphysema肺气肿/Diabetes糖尿病
            "ARTHDX", "ASTHDX", "ADHDADDX","PREGNT31","IADLHP31"]
#           arthritis关节炎/asthma/ 多动症/ pregnant / independent living screener


exp_l = ["TOTEXP15", "OPTEXP15", "OBVEXP15",   "HHAEXP15",               "ERTEXP15","IPTEXP15", "RXEXP15"]
In [10]:
def summary_y(d,e,df_input,df_all):
    '''
    disease vs expenditure;
    df_input is feature df
    df_all is the whole df for extracting d and e;
    return train/ test R^2 and MAE.
    '''
    #input
    df_c = df_input
    df=df_all
    df_c = pd.concat([df_c, df[e]],axis =1)
    df_c = df_c.loc[:,~df_c.columns.duplicated()]
    df_cheart = df_c.loc[(df_c[d] == 1) & (df_c[e] >0)]
    #preprocess
    y =    df_cheart[e]
    #plt.figure()
   # sns.distplot(y)
   # plt.show()   
    
    plt.figure()
    sns.distplot(np.log(y+10))
    plt.show() 
    if len(y) < 10:
        ret = "observation count smaller than 10"
        print(d,"obervation count smaller than 10")
        return ret
    x = df_cheart.drop([e], axis=1)

    return y.describe()
In [11]:
#emphdx hhaexp
#
out_dic_summary ={}
n=0
for d in disease_f:
    out_dic_summary[d] = {}
    for e in exp_l:
        print("----------------------------------")
        print(n)
        print(d)
        print(e)
        n+=1
        
        try :
            out_dic_summary[d][e] = summary_y(d,e,df_c,df)
            print(out_dic_summary[d][e])
        except:
            out_dic[d][e] = "NaN"
outt_df_summary = pd.DataFrame(out_dic_summary)
outt_df_summary = outt_df_summary.transpose()
outt_df_summary.to_excel("MEPS_Disease_Expenditure_stats_summary_7.12.xlsx")
----------------------------------
0
CANCERDX
TOTEXP15
count      2170.000000
mean      13860.709217
std       29427.119742
min           4.000000
25%        1931.500000
50%        5179.000000
75%       14308.500000
max      700771.000000
Name: TOTEXP15, dtype: float64
----------------------------------
1
CANCERDX
OPTEXP15
count      752.000000
mean      3691.710106
std       8639.432302
min          3.000000
25%        268.500000
50%        929.500000
75%       3223.250000
max      84746.000000
Name: OPTEXP15, dtype: float64
----------------------------------
2
CANCERDX
OBVEXP15
count      2042.000000
mean       3684.469148
std        8307.536792
min           3.000000
25%         563.500000
50%        1482.000000
75%        3598.750000
max      178468.000000
Name: OBVEXP15, dtype: float64
----------------------------------
3
CANCERDX
HHAEXP15
count       220.000000
mean       8100.122727
std       19657.491202
min          34.000000
25%        1162.750000
50%        3020.500000
75%        8472.750000
max      198046.000000
Name: HHAEXP15, dtype: float64
----------------------------------
4
CANCERDX
ERTEXP15
count      518.000000
mean      1445.602317
std       2341.091318
min          6.000000
25%        301.500000
50%        723.000000
75%       1551.500000
max      28333.000000
Name: ERTEXP15, dtype: float64
----------------------------------
5
CANCERDX
IPTEXP15
count       393.000000
mean      24797.376590
std       46807.703342
min          38.000000
25%        6329.000000
50%       12477.000000
75%       28994.000000
max      663917.000000
Name: IPTEXP15, dtype: float64
----------------------------------
6
CANCERDX
RXEXP15
count     2013.000000
mean      3015.944858
std       6779.884382
min          1.000000
25%        228.000000
50%        848.000000
75%       2946.000000
max      95671.000000
Name: RXEXP15, dtype: float64
----------------------------------
7
CHDDX
TOTEXP15
count      1256.000000
mean      18183.140924
std       33581.368129
min          10.000000
25%        2638.750000
50%        7617.000000
75%       20967.000000
max      608264.000000
Name: TOTEXP15, dtype: float64
----------------------------------
8
CHDDX
OPTEXP15
count      453.000000
mean      3189.139073
std       8223.363006
min          3.000000
25%        244.000000
50%        759.000000
75%       2393.000000
max      83416.000000
Name: OPTEXP15, dtype: float64
----------------------------------
9
CHDDX
OBVEXP15
count      1171.000000
mean       3489.941076
std       16474.322768
min           2.000000
25%         466.500000
50%        1311.000000
75%        2990.000000
max      526744.000000
Name: OBVEXP15, dtype: float64
----------------------------------
10
CHDDX
HHAEXP15
count       215.000000
mean      10255.213953
std       18362.060470
min          51.000000
25%        1620.000000
50%        4212.000000
75%       12360.500000
max      155022.000000
Name: HHAEXP15, dtype: float64
----------------------------------
11
CHDDX
ERTEXP15
count      398.000000
mean      1614.776382
std       2973.113058
min          5.000000
25%        276.000000
50%        691.500000
75%       1597.500000
max      28333.000000
Name: ERTEXP15, dtype: float64
----------------------------------
12
CHDDX
IPTEXP15
count       334.000000
mean      25626.676647
std       34779.441283
min          25.000000
25%        7470.250000
50%       13992.000000
75%       31750.750000
max      430298.000000
Name: IPTEXP15, dtype: float64
----------------------------------
13
CHDDX
RXEXP15
count      1209.000000
mean       4141.463193
std        7455.813894
min           1.000000
25%         534.000000
50%        1781.000000
75%        4791.000000
max      129817.000000
Name: RXEXP15, dtype: float64
----------------------------------
14
STRKDX
TOTEXP15
count      1013.000000
mean      19365.348470
std       36622.614907
min           5.000000
25%        2443.000000
50%        7470.000000
75%       21056.000000
max      530792.000000
Name: TOTEXP15, dtype: float64
----------------------------------
15
STRKDX
OPTEXP15
count      316.000000
mean      2907.164557
std       7761.035309
min          3.000000
25%        235.500000
50%        730.000000
75%       2229.250000
max      83416.000000
Name: OPTEXP15, dtype: float64
----------------------------------
16
STRKDX
OBVEXP15
count      934.000000
mean      3112.146681
std       6426.993434
min          3.000000
25%        428.000000
50%       1062.500000
75%       2979.000000
max      69537.000000
Name: OBVEXP15, dtype: float64
----------------------------------
17
STRKDX
HHAEXP15
count       190.000000
mean       8719.394737
std       15522.360082
min          51.000000
25%        1315.750000
50%        3720.500000
75%       10183.750000
max      131888.000000
Name: HHAEXP15, dtype: float64
----------------------------------
18
STRKDX
ERTEXP15
count      365.000000
mean      1598.068493
std       2751.344608
min         14.000000
25%        322.000000
50%        742.000000
75%       1598.000000
max      22870.000000
Name: ERTEXP15, dtype: float64
----------------------------------
19
STRKDX
IPTEXP15
count       283.000000
mean      29486.459364
std       45988.863797
min          18.000000
25%        6677.500000
50%       13429.000000
75%       31687.000000
max      430298.000000
Name: IPTEXP15, dtype: float64
----------------------------------
20
STRKDX
RXEXP15
count       964.000000
mean       4739.189834
std       15418.300506
min           4.000000
25%         523.000000
50%        1803.000000
75%        4916.250000
max      414941.000000
Name: RXEXP15, dtype: float64
----------------------------------
21
OHRTDX
TOTEXP15
count      2370.000000
mean      14087.199156
std       27857.680988
min           4.000000
25%        1776.250000
50%        5378.000000
75%       15953.250000
max      608264.000000
Name: TOTEXP15, dtype: float64
----------------------------------
22
OHRTDX
OPTEXP15
count       837.000000
mean       2819.206691
std        7531.657360
min           3.000000
25%         209.000000
50%         670.000000
75%        2258.000000
max      121690.000000
Name: OPTEXP15, dtype: float64
----------------------------------
23
OHRTDX
OBVEXP15
count      2199.000000
mean       3066.876762
std       12422.503470
min           6.000000
25%         451.000000
50%        1206.000000
75%        2936.500000
max      526744.000000
Name: OBVEXP15, dtype: float64
----------------------------------
24
OHRTDX
HHAEXP15
count       271.000000
mean       7334.011070
std       11537.695794
min          34.000000
25%        1307.000000
50%        3074.000000
75%        9944.000000
max      131888.000000
Name: HHAEXP15, dtype: float64
----------------------------------
25
OHRTDX
ERTEXP15
count      695.000000
mean      1716.684892
std       3441.602339
min          5.000000
25%        323.500000
50%        741.000000
75%       1757.500000
max      55261.000000
Name: ERTEXP15, dtype: float64
----------------------------------
26
OHRTDX
IPTEXP15
count       483.000000
mean      24013.213251
std       33380.971041
min          25.000000
25%        6856.500000
50%       13092.000000
75%       29837.000000
max      430298.000000
Name: IPTEXP15, dtype: float64
----------------------------------
27
OHRTDX
RXEXP15
count      2192.000000
mean       3597.596715
std        7410.696040
min           1.000000
25%         302.750000
50%        1247.000000
75%        3935.500000
max      168967.000000
Name: RXEXP15, dtype: float64
----------------------------------
28
HIBPDX
TOTEXP15
count      7890.000000
mean      10035.796198
std       22939.658200
min           1.000000
25%        1052.250000
50%        3283.500000
75%        9695.000000
max      700771.000000
Name: TOTEXP15, dtype: float64
----------------------------------
29
HIBPDX
OPTEXP15
count      2055.000000
mean       2667.537226
std        7610.746933
min           3.000000
25%         191.000000
50%         642.000000
75%        2245.500000
max      134843.000000
Name: OPTEXP15, dtype: float64
----------------------------------
30
HIBPDX
OBVEXP15
count      7125.000000
mean       2276.283088
std        8277.620600
min           1.000000
25%         316.000000
50%         822.000000
75%        2149.000000
max      526744.000000
Name: OBVEXP15, dtype: float64
----------------------------------
31
HIBPDX
HHAEXP15
count       629.000000
mean       8803.821940
std       17403.235013
min          39.000000
25%        1287.000000
50%        3602.000000
75%        9706.000000
max      198046.000000
Name: HHAEXP15, dtype: float64
----------------------------------
32
HIBPDX
ERTEXP15
count      1734.000000
mean       1593.212803
std        3699.818870
min           5.000000
25%         302.250000
50%         696.500000
75%        1578.500000
max      104277.000000
Name: ERTEXP15, dtype: float64
----------------------------------
33
HIBPDX
IPTEXP15
count      1087.000000
mean      21975.540938
std       34268.497184
min          25.000000
25%        6098.500000
50%       12407.000000
75%       27130.000000
max      663917.000000
Name: IPTEXP15, dtype: float64
----------------------------------
34
HIBPDX
RXEXP15
count      7349.000000
mean       2899.065451
std        8656.294799
min           1.000000
25%         185.000000
50%         759.000000
75%        2844.000000
max      414941.000000
Name: RXEXP15, dtype: float64
----------------------------------
35
EMPHDX
TOTEXP15
count       452.000000
mean      16158.426991
std       28147.618666
min          20.000000
25%        2410.000000
50%        7032.000000
75%       20551.000000
max      423121.000000
Name: TOTEXP15, dtype: float64
----------------------------------
36
EMPHDX
OPTEXP15
count      183.000000
mean      1770.688525
std       3050.548179
min         13.000000
25%        194.000000
50%        571.000000
75%       1683.500000
max      18505.000000
Name: OPTEXP15, dtype: float64
----------------------------------
37
EMPHDX
OBVEXP15
count      415.000000
mean      2546.028916
std       4680.951104
min         19.000000
25%        461.000000
50%       1163.000000
75%       2788.500000
max      51870.000000
Name: OBVEXP15, dtype: float64
----------------------------------
38
EMPHDX
HHAEXP15
count       86.000000
mean      5912.674419
std       7714.773504
min         49.000000
25%        957.750000
50%       2459.500000
75%       9004.000000
max      43515.000000
Name: HHAEXP15, dtype: float64
----------------------------------
39
EMPHDX
ERTEXP15
count      158.000000
mean      2073.993671
std       4240.298174
min          5.000000
25%        347.750000
50%        780.500000
75%       1592.750000
max      28333.000000
Name: ERTEXP15, dtype: float64
----------------------------------
40
EMPHDX
IPTEXP15
count       116.000000
mean      19466.327586
std       21620.414386
min         161.000000
25%        6592.000000
50%       11477.000000
75%       21997.750000
max      114057.000000
Name: IPTEXP15, dtype: float64
----------------------------------
41
EMPHDX
RXEXP15
count       438.000000
mean       5902.748858
std       21473.491800
min           1.000000
25%         553.500000
50%        2083.500000
75%        5857.250000
max      414941.000000
Name: RXEXP15, dtype: float64
----------------------------------
42
DIABDX
TOTEXP15
count      2715.000000
mean      13188.394843
std       26790.965091
min           1.000000
25%        1869.000000
50%        5387.000000
75%       14334.500000
max      608264.000000
Name: TOTEXP15, dtype: float64
----------------------------------
43
DIABDX
OPTEXP15
count       753.000000
mean       2779.197875
std        8801.178013
min           3.000000
25%         211.000000
50%         645.000000
75%        2208.000000
max      121690.000000
Name: OPTEXP15, dtype: float64
----------------------------------
44
DIABDX
OBVEXP15
count      2510.000000
mean       2589.939044
std       12118.552347
min           7.000000
25%         366.250000
50%         931.000000
75%        2332.500000
max      526744.000000
Name: OBVEXP15, dtype: float64
----------------------------------
45
DIABDX
HHAEXP15
count       291.000000
mean       7957.460481
std       13892.233436
min          55.000000
25%        1374.500000
50%        3760.000000
75%        9372.500000
max      155022.000000
Name: HHAEXP15, dtype: float64
----------------------------------
46
DIABDX
ERTEXP15
count      627.000000
mean      1359.397129
std       2092.706441
min          9.000000
25%        295.000000
50%        684.000000
75%       1517.500000
max      24004.000000
Name: ERTEXP15, dtype: float64
----------------------------------
47
DIABDX
IPTEXP15
count       444.000000
mean      23023.405405
std       31134.521241
min          25.000000
25%        6582.250000
50%       13101.500000
75%       29035.000000
max      430298.000000
Name: IPTEXP15, dtype: float64
----------------------------------
48
DIABDX
RXEXP15
count      2635.000000
mean       4740.054649
std       11247.662413
min           1.000000
25%         529.000000
50%        2023.000000
75%        5578.500000
max      414941.000000
Name: RXEXP15, dtype: float64
----------------------------------
49
ARTHDX
TOTEXP15
count      6066.000000
mean      11486.399275
std       21757.133632
min           1.000000
25%        1505.750000
50%        4526.000000
75%       12293.500000
max      530792.000000
Name: TOTEXP15, dtype: float64
----------------------------------
50
ARTHDX
OPTEXP15
count      1961.000000
mean       2818.145334
std        7604.901311
min           3.000000
25%         230.000000
50%         736.000000
75%        2447.000000
max      134843.000000
Name: OPTEXP15, dtype: float64
----------------------------------
51
ARTHDX
OBVEXP15
count      5680.000000
mean       2613.722535
std        5784.141261
min           1.000000
25%         421.750000
50%        1101.000000
75%        2739.000000
max      178468.000000
Name: OBVEXP15, dtype: float64
----------------------------------
52
ARTHDX
HHAEXP15
count       566.000000
mean       8076.745583
std       14576.718757
min          34.000000
25%        1298.500000
50%        3534.000000
75%        9488.250000
max      172690.000000
Name: HHAEXP15, dtype: float64
----------------------------------
53
ARTHDX
ERTEXP15
count      1473.000000
mean       1673.126273
std        4156.105029
min           5.000000
25%         324.000000
50%         741.000000
75%        1596.000000
max      104277.000000
Name: ERTEXP15, dtype: float64
----------------------------------
54
ARTHDX
IPTEXP15
count       906.000000
mean      21866.246137
std       29483.166271
min          18.000000
25%        6415.750000
50%       12377.000000
75%       26559.750000
max      430298.000000
Name: IPTEXP15, dtype: float64
----------------------------------
55
ARTHDX
RXEXP15
count      5674.000000
mean       3331.930208
std        9076.192665
min           1.000000
25%         224.000000
50%         956.000000
75%        3403.750000
max      414941.000000
Name: RXEXP15, dtype: float64
----------------------------------
56
ASTHDX
TOTEXP15
count      3143.000000
mean       7546.701559
std       17931.892908
min           1.000000
25%         569.000000
50%        2127.000000
75%        7115.000000
max      530792.000000
Name: TOTEXP15, dtype: float64
----------------------------------
57
ASTHDX
OPTEXP15
count      662.000000
mean      2664.070997
std       7229.006612
min          3.000000
25%        180.250000
50%        600.000000
75%       2360.000000
max      85911.000000
Name: OPTEXP15, dtype: float64
----------------------------------
58
ASTHDX
OBVEXP15
count     2735.000000
mean      1822.607678
std       4169.829418
min          1.000000
25%        208.500000
50%        610.000000
75%       1784.000000
max      68009.000000
Name: OBVEXP15, dtype: float64
----------------------------------
59
ASTHDX
HHAEXP15
count       157.000000
mean       8355.789809
std       18413.400751
min          39.000000
25%        1163.000000
50%        3564.000000
75%        9804.000000
max      201150.000000
Name: HHAEXP15, dtype: float64
----------------------------------
60
ASTHDX
ERTEXP15
count      790.000000
mean      1522.732911
std       3030.114536
min          9.000000
25%        286.000000
50%        678.000000
75%       1507.250000
max      39471.000000
Name: ERTEXP15, dtype: float64
----------------------------------
61
ASTHDX
IPTEXP15
count       332.000000
mean      17495.921687
std       30839.353171
min          53.000000
25%        4484.000000
50%        9282.000000
75%       19336.000000
max      430298.000000
Name: IPTEXP15, dtype: float64
----------------------------------
62
ASTHDX
RXEXP15
count      2658.000000
mean       2730.486080
std        6993.978741
min           1.000000
25%         110.000000
50%         571.000000
75%        2589.500000
max      171569.000000
Name: RXEXP15, dtype: float64
----------------------------------
63
ADHDADDX
TOTEXP15
count       730.000000
mean       4047.328767
std        8190.232277
min           5.000000
25%         531.000000
50%        1612.500000
75%        3938.000000
max      111476.000000
Name: TOTEXP15, dtype: float64
----------------------------------
64
ADHDADDX
OPTEXP15
count       75.000000
mean      2672.640000
std      10563.373175
min         17.000000
25%         97.000000
50%        233.000000
75%       1116.000000
max      85911.000000
Name: OPTEXP15, dtype: float64
----------------------------------
65
ADHDADDX
OBVEXP15
count      636.000000
mean      1209.545597
std       3347.353161
min          7.000000
25%        168.750000
50%        415.500000
75%       1061.500000
max      61928.000000
Name: OBVEXP15, dtype: float64
----------------------------------
66
ADHDADDX
HHAEXP15
count       31.000000
mean      8670.548387
std       9239.412704
min        134.000000
25%       1745.500000
50%       6301.000000
75%      11243.000000
max      35184.000000
Name: HHAEXP15, dtype: float64
----------------------------------
67
ADHDADDX
ERTEXP15
count     118.000000
mean      864.508475
std      1324.954795
min        34.000000
25%       208.000000
50%       398.000000
75%       826.500000
max      7693.000000
Name: ERTEXP15, dtype: float64
----------------------------------
68
ADHDADDX
IPTEXP15
count       26.000000
mean      9522.269231
std       7800.545050
min        413.000000
25%       3717.500000
50%       6905.500000
75%      13262.000000
max      25870.000000
Name: IPTEXP15, dtype: float64
----------------------------------
69
ADHDADDX
RXEXP15
count      564.000000
mean      1809.177305
std       2916.104612
min          1.000000
25%        160.750000
50%        833.500000
75%       2366.750000
max      25642.000000
Name: RXEXP15, dtype: float64
----------------------------------
70
PREGNT31
TOTEXP15
count       380.000000
mean      10816.586842
std       12055.271392
min          24.000000
25%        3200.000000
50%        8047.000000
75%       13767.000000
max      114262.000000
Name: TOTEXP15, dtype: float64
----------------------------------
71
PREGNT31
OPTEXP15
count       88.00000
mean      1605.75000
std       2450.15173
min          8.00000
25%        216.75000
50%        620.50000
75%       1919.00000
max      11867.00000
Name: OPTEXP15, dtype: float64
----------------------------------
72
PREGNT31
OBVEXP15
count      339.000000
mean      2268.365782
std       4189.201411
min         24.000000
25%        424.500000
50%       1184.000000
75%       2434.000000
max      44155.000000
Name: OBVEXP15, dtype: float64
----------------------------------
73
PREGNT31
HHAEXP15
PREGNT31 obervation count smaller than 10
observation count smaller than 10
----------------------------------
74
PREGNT31
ERTEXP15
count      113.000000
mean      1600.646018
std       2966.442178
min          2.000000
25%        260.000000
50%        583.000000
75%       1737.000000
max      19418.000000
Name: ERTEXP15, dtype: float64
----------------------------------
75
PREGNT31
IPTEXP15
count       269.000000
mean      10579.918216
std       11132.914936
min          34.000000
25%        4514.000000
50%        7494.000000
75%       11971.000000
max      111309.000000
Name: IPTEXP15, dtype: float64
----------------------------------
76
PREGNT31
RXEXP15
count      275.000000
mean       386.683636
std       1741.876754
min          1.000000
25%         20.500000
50%         62.000000
75%        203.500000
max      26507.000000
Name: RXEXP15, dtype: float64
----------------------------------
77
IADLHP31
TOTEXP15
count      1052.000000
mean      22445.936312
std       32854.387304
min           1.000000
25%        3463.750000
50%       11281.000000
75%       29199.500000
max      423121.000000
Name: TOTEXP15, dtype: float64
----------------------------------
78
IADLHP31
OPTEXP15
count       327.000000
mean       3528.155963
std        9950.472563
min           4.000000
25%         260.000000
50%         759.000000
75%        2660.500000
max      121690.000000
Name: OPTEXP15, dtype: float64
----------------------------------
79
IADLHP31
OBVEXP15
count       980.000000
mean       3483.918367
std        9126.258130
min           3.000000
25%         474.500000
50%        1225.000000
75%        3010.000000
max      178468.000000
Name: OBVEXP15, dtype: float64
----------------------------------
80
IADLHP31
HHAEXP15
count       398.000000
mean      12559.095477
std       20625.584328
min          34.000000
25%        2406.750000
50%        6692.000000
75%       14282.000000
max      198046.000000
Name: HHAEXP15, dtype: float64
----------------------------------
81
IADLHP31
ERTEXP15
count      374.000000
mean      1554.050802
std       2873.757698
min         11.000000
25%        317.250000
50%        797.000000
75%       1592.000000
max      22870.000000
Name: ERTEXP15, dtype: float64
----------------------------------
82
IADLHP31
IPTEXP15
count       279.000000
mean      23949.523297
std       27696.956143
min         161.000000
25%        7240.500000
50%       14029.000000
75%       29195.500000
max      200774.000000
Name: IPTEXP15, dtype: float64
----------------------------------
83
IADLHP31
RXEXP15
count       995.000000
mean       5895.928643
std       16233.589365
min           1.000000
25%         622.500000
50%        2350.000000
75%        6066.000000
max      414941.000000
Name: RXEXP15, dtype: float64
In [12]:
#####
#out model performance
out_dic_summary ={}
n=0
for d in disease_f:
    out_dic_summary[d] = {}
    for e in exp_l:
        print("----------------------------------")
        print(n)
        print(d)
        print(e)
        n+=1
        try :
            out_dic_summary[d][e] = DvsE_out_plot(d,e,df_c,df)
            print(out_dic_summary[d][e])
        except:
            out_dic_summary[d][e] = "NaN"
outt_df_summary = pd.DataFrame(out_dic_summary)
outt_df_summary = outt_df_summary.transpose()
----------------------------------
0
CANCERDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   38.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805911fd0>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 65}
{'test R^2': 0.6286985284841843, 'observations_count': 2170}
----------------------------------
1
CANCERDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   32.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814c926d8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.3383718065347896, 'observations_count': 752}
----------------------------------
2
CANCERDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   35.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814de0b38>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.7063719145060977, 'observations_count': 2042}
----------------------------------
3
CANCERDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x228059622e8>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': -0.020128457592152375, 'observations_count': 220}
----------------------------------
4
CANCERDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   23.2s finished
<matplotlib.figure.Figure at 0x2280596e8d0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.14755894597245967, 'observations_count': 518}
----------------------------------
5
CANCERDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.5s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805964ef0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.05285048065057457, 'observations_count': 393}
----------------------------------
6
CANCERDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   30.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22833d79c88>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.6473263203611558, 'observations_count': 2013}
----------------------------------
7
CHDDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   27.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x2282f41a8d0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.5360417406505904, 'observations_count': 1256}
----------------------------------
8
CHDDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805967d68>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.1896683581757754, 'observations_count': 453}
----------------------------------
9
CHDDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   30.6s finished
<matplotlib.figure.Figure at 0x22814c922b0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.7227387346499604, 'observations_count': 1171}
----------------------------------
10
CHDDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   17.3s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812b2ceb8>
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 45}
{'test R^2': 0.09459193934095644, 'observations_count': 215}
----------------------------------
11
CHDDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.6s finished
<matplotlib.figure.Figure at 0x22814d68eb8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.052606537108156805, 'observations_count': 398}
----------------------------------
12
CHDDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.3s finished
<matplotlib.figure.Figure at 0x22805c97cf8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.12008597168092905, 'observations_count': 334}
----------------------------------
13
CHDDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   24.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805975eb8>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.6221564075688459, 'observations_count': 1209}
----------------------------------
14
STRKDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   26.4s finished
<matplotlib.figure.Figure at 0x228058d0e80>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 65}
{'test R^2': 0.5138954539041255, 'observations_count': 1013}
----------------------------------
15
STRKDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.1s finished
<matplotlib.figure.Figure at 0x22812bcc978>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.2731498828417156, 'observations_count': 316}
----------------------------------
16
STRKDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   32.5s finished
<matplotlib.figure.Figure at 0x22805a85780>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.7116788879953767, 'observations_count': 934}
----------------------------------
17
STRKDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  74 out of  81 | elapsed:   19.8s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812bcce10>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.02460500511849435, 'observations_count': 190}
----------------------------------
18
STRKDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805911b00>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.045730065081323844, 'observations_count': 365}
----------------------------------
19
STRKDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done  74 out of  81 | elapsed:   19.1s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.5s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x228228752e8>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': -0.06049915194039986, 'observations_count': 283}
----------------------------------
20
STRKDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   21.4s finished
<matplotlib.figure.Figure at 0x2282cbcc320>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.5459588294438087, 'observations_count': 964}
----------------------------------
21
OHRTDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   30.2s finished
<matplotlib.figure.Figure at 0x22814ca8d68>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 65}
{'test R^2': 0.6226687814460312, 'observations_count': 2370}
----------------------------------
22
OHRTDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.6s finished
<matplotlib.figure.Figure at 0x22805911fd0>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.2592469030380532, 'observations_count': 837}
----------------------------------
23
OHRTDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   29.3s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22834558c88>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.7022967995308363, 'observations_count': 2199}
----------------------------------
24
OHRTDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   18.0s finished
<matplotlib.figure.Figure at 0x22805c85eb8>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': -0.10659922875003598, 'observations_count': 271}
----------------------------------
25
OHRTDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x228059679e8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.21376003067320645, 'observations_count': 695}
----------------------------------
26
OHRTDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805975780>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': -0.12591330404026868, 'observations_count': 483}
----------------------------------
27
OHRTDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   27.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814c92ef0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.653824134572093, 'observations_count': 2192}
----------------------------------
28
HIBPDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.0min finished
<matplotlib.figure.Figure at 0x22805a969e8>
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 65}
{'test R^2': 0.6475766947608681, 'observations_count': 7890}
----------------------------------
29
HIBPDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   37.2s finished
<matplotlib.figure.Figure at 0x22805b31d68>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.32744456269473565, 'observations_count': 2055}
----------------------------------
30
HIBPDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.1min finished
<matplotlib.figure.Figure at 0x22814ca8c50>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.7308902372648475, 'observations_count': 7125}
----------------------------------
31
HIBPDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   26.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814d2f278>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.1820695087125828, 'observations_count': 629}
----------------------------------
32
HIBPDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   37.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x2282ce4c358>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.17038827902805267, 'observations_count': 1734}
----------------------------------
33
HIBPDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   34.0s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812c05a20>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.04513485749114331, 'observations_count': 1087}
----------------------------------
34
HIBPDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.1min finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814cbf588>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 65}
{'test R^2': 0.6568145365830241, 'observations_count': 7349}
----------------------------------
35
EMPHDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   22.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814ed8d30>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.5773840634886428, 'observations_count': 452}
----------------------------------
36
EMPHDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  74 out of  81 | elapsed:   20.8s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   21.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805962630>
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 25}
{'test R^2': 0.03250125924900528, 'observations_count': 183}
----------------------------------
37
EMPHDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   22.0s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805c7ea20>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.6981488806773553, 'observations_count': 415}
----------------------------------
38
EMPHDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.2s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814d2f0f0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': -0.051425702670195875, 'observations_count': 86}
----------------------------------
39
EMPHDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   18.4s finished
<matplotlib.figure.Figure at 0x22805964240>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.0033641775920118335, 'observations_count': 158}
----------------------------------
40
EMPHDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  74 out of  81 | elapsed:   17.3s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   17.5s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805911c18>
{'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 45}
{'test R^2': -0.2883481071827206, 'observations_count': 116}
----------------------------------
41
EMPHDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814ed8550>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.5387525496981884, 'observations_count': 438}
----------------------------------
42
DIABDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   38.0s finished
<matplotlib.figure.Figure at 0x22805962fd0>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 65}
{'test R^2': 0.6071696018027921, 'observations_count': 2715}
----------------------------------
43
DIABDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   28.2s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812c05588>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.19517234922617485, 'observations_count': 753}
----------------------------------
44
DIABDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   36.5s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805964b00>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.7480884436078545, 'observations_count': 2510}
----------------------------------
45
DIABDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805967a58>
{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 45}
{'test R^2': -0.03896412586810015, 'observations_count': 291}
----------------------------------
46
DIABDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   25.4s finished
<matplotlib.figure.Figure at 0x228345589e8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.08882232195807088, 'observations_count': 627}
----------------------------------
47
DIABDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   30.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814cf4f60>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.06386347782503794, 'observations_count': 444}
----------------------------------
48
DIABDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   42.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805b31048>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.5722083760468746, 'observations_count': 2635}
----------------------------------
49
ARTHDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   54.2s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805983a20>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 65}
{'test R^2': 0.6327692365464965, 'observations_count': 6066}
----------------------------------
50
ARTHDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   33.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805962048>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.2934046889000846, 'observations_count': 1961}
----------------------------------
51
ARTHDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   57.3s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814dcd4e0>
{'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.7301739638008115, 'observations_count': 5680}
----------------------------------
52
ARTHDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   35.0s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812c05898>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.10258999584097604, 'observations_count': 566}
----------------------------------
53
ARTHDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   29.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805983898>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.15314662245698937, 'observations_count': 1473}
----------------------------------
54
ARTHDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   24.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805962208>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.029579485557979804, 'observations_count': 906}
----------------------------------
55
ARTHDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   55.9s finished
<matplotlib.figure.Figure at 0x22814d2f940>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.6533915025585435, 'observations_count': 5674}
----------------------------------
56
ASTHDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   38.6s finished
<matplotlib.figure.Figure at 0x2282f7eff28>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.6798876994528665, 'observations_count': 3143}
----------------------------------
57
ASTHDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   23.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x2280596ebe0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.18892823598309935, 'observations_count': 662}
----------------------------------
58
ASTHDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   36.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814d2f208>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.744791607213207, 'observations_count': 2735}
----------------------------------
59
ASTHDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   22.7s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814dcd9b0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': -0.05110822652670419, 'observations_count': 157}
----------------------------------
60
ASTHDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   24.0s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814d34588>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.24586506944598147, 'observations_count': 790}
----------------------------------
61
ASTHDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.2s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812aa59b0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.151960226274786, 'observations_count': 332}
----------------------------------
62
ASTHDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   37.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814d68f98>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.7307803314062442, 'observations_count': 2658}
----------------------------------
63
ADHDADDX
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   21.0s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805964828>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.6330096444684841, 'observations_count': 730}
----------------------------------
64
ADHDADDX
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done  74 out of  81 | elapsed:   19.3s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.4s finished
<matplotlib.figure.Figure at 0x2282cbcc320>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.08747159062936571, 'observations_count': 75}
----------------------------------
65
ADHDADDX
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   22.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22834558b70>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.7578045118312053, 'observations_count': 636}
----------------------------------
66
ADHDADDX
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   17.1s finished
<matplotlib.figure.Figure at 0x22814cf4b00>
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 45}
{'test R^2': -1.7133391151223654, 'observations_count': 31}
----------------------------------
67
ADHDADDX
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   18.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805c855c0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.16494710089924713, 'observations_count': 118}
----------------------------------
68
ADHDADDX
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   17.3s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x228058d05c0>
{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 65}
{'test R^2': -1.913128344789796, 'observations_count': 26}
----------------------------------
69
ADHDADDX
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   21.2s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805a95fd0>
{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 25}
{'test R^2': 0.623441656134367, 'observations_count': 564}
----------------------------------
70
PREGNT31
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   21.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812be2668>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.19070000324655695, 'observations_count': 380}
----------------------------------
71
PREGNT31
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done  74 out of  81 | elapsed:   17.6s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   17.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805a952e8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.14946107830462685, 'observations_count': 88}
----------------------------------
72
PREGNT31
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.3s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805a852e8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.5287573661487349, 'observations_count': 339}
----------------------------------
73
PREGNT31
HHAEXP15
PREGNT31 obervation count smaller than 10
observation count smaller than 10
----------------------------------
74
PREGNT31
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   20.3s finished
<matplotlib.figure.Figure at 0x22814de04e0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.13772510936740923, 'observations_count': 113}
----------------------------------
75
PREGNT31
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   18.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805c7e898>
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 45}
{'test R^2': -0.07436202936019726, 'observations_count': 269}
----------------------------------
76
PREGNT31
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   19.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812b2c0f0>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.41493378484885285, 'observations_count': 275}
----------------------------------
77
IADLHP31
TOTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   28.9s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22812be8518>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.4640830010310039, 'observations_count': 1052}
----------------------------------
78
IADLHP31
OPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   25.4s finished
<matplotlib.figure.Figure at 0x22814cbfa90>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.2631459892661853, 'observations_count': 327}
----------------------------------
79
IADLHP31
OBVEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   30.8s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22805b31c18>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.7154245599230405, 'observations_count': 980}
----------------------------------
80
IADLHP31
HHAEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   23.2s finished
<matplotlib.figure.Figure at 0x22812be8908>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.09989185423383322, 'observations_count': 398}
----------------------------------
81
IADLHP31
ERTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   26.4s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x2280596e7b8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': 0.13763853343642518, 'observations_count': 374}
----------------------------------
82
IADLHP31
IPTEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   25.1s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814ce9ba8>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 45}
{'test R^2': -0.19020483050213724, 'observations_count': 279}
----------------------------------
83
IADLHP31
RXEXP15
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   25.6s finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:734: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
<matplotlib.figure.Figure at 0x22814de0908>
{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 65}
{'test R^2': 0.6150197062625082, 'observations_count': 995}
In [13]:
outt_df_summary.to_excel("MEPS_Disease_Expenditure_model_performance_7.12.xlsx")
In [61]:
## overfitting problem: large feature size and small sample size?
## how we treat 0 in response

## compare to previous one where best model is test r^2 0.73 test MAE 0.75; now the OBVEXP column model performs better, 
## ERTOT(medcine), TOTEXP are ok; others sample size small 
In [62]:
# obtotv15 obvexp15
corr
Out[62]:
Pregnant Emphysema Arthritis High Cholesterol Heart Attack Cancer Diabetes High Blood Pressure Asthma Attention-deficit/hyperactivity disorder Coronary Heart Disease Other Heart Disease Stroke Angina Patients_Disease_Count TOTEXP15
Pregnant 1.000000 0.217239 0.221633 0.246454 0.219934 0.203919 0.214450 0.244350 -0.015750 -0.096639 0.224235 0.198269 0.210374 0.215627 -0.157463 -0.051877
Emphysema 0.217239 1.000000 0.324405 0.249685 0.738109 0.575713 0.508478 0.209119 0.087281 -0.512087 0.693918 0.551121 0.709783 0.789386 0.179293 0.065139
Arthritis 0.221633 0.324405 1.000000 0.364878 0.335946 0.337932 0.340357 0.391487 0.113756 -0.149539 0.348650 0.350203 0.333529 0.323572 -0.463997 -0.156275
High Cholesterol 0.246454 0.249685 0.364878 1.000000 0.309263 0.272229 0.391312 0.468717 0.047701 -0.113088 0.333757 0.289947 0.287565 0.277459 -0.516120 -0.120669
Heart Attack 0.219934 0.738109 0.335946 0.309263 1.000000 0.535622 0.514973 0.274723 0.052927 -0.457481 0.801229 0.577286 0.696300 0.779357 0.002330 0.004763
Cancer 0.203919 0.575713 0.337932 0.272229 0.535622 1.000000 0.383797 0.249228 0.038109 -0.344395 0.514767 0.446633 0.527427 0.556803 -0.083479 -0.049739
Diabetes 0.214450 0.508478 0.340357 0.391312 0.514973 0.383797 1.000000 0.372487 0.056131 -0.308465 0.510865 0.409561 0.502006 0.519280 -0.209349 -0.068768
High Blood Pressure 0.244350 0.209119 0.391487 0.468717 0.274723 0.249228 0.372487 1.000000 0.067759 -0.081936 0.303959 0.280562 0.274849 0.235502 -0.565450 -0.150810
Asthma -0.015750 0.087281 0.113756 0.047701 0.052927 0.038109 0.056131 0.067759 1.000000 -0.006540 0.050013 0.067355 0.056576 0.052564 -0.270238 -0.061904
Attention-deficit/hyperactivity disorder -0.096639 -0.512087 -0.149539 -0.113088 -0.457481 -0.344395 -0.308465 -0.081936 -0.006540 1.000000 -0.420852 -0.328782 -0.444667 -0.499098 -0.277016 -0.095568
Coronary Heart Disease 0.224235 0.693918 0.348650 0.333757 0.801229 0.514767 0.510865 0.303959 0.050013 -0.420852 1.000000 0.620832 0.653511 0.756242 -0.090571 -0.031203
Other Heart Disease 0.198269 0.551121 0.350203 0.289947 0.577286 0.446633 0.409561 0.280562 0.067355 -0.328782 0.620832 1.000000 0.541865 0.586812 -0.179530 -0.063473
Stroke 0.210374 0.709783 0.333529 0.287565 0.696300 0.527427 0.502006 0.274849 0.056576 -0.444667 0.653511 0.541865 1.000000 0.706179 0.026351 -0.015526
Angina 0.215627 0.789386 0.323572 0.277459 0.779357 0.556803 0.519280 0.235502 0.052564 -0.499098 0.756242 0.586812 0.706179 1.000000 0.118733 0.043964
Patients_Disease_Count -0.157463 0.179293 -0.463997 -0.516120 0.002330 -0.083479 -0.209349 -0.565450 -0.270238 -0.277016 -0.090571 -0.179530 0.026351 0.118733 1.000000 0.322759
TOTEXP15 -0.051877 0.065139 -0.156275 -0.120669 0.004763 -0.049739 -0.068768 -0.150810 -0.061904 -0.095568 -0.031203 -0.063473 -0.015526 0.043964 0.322759 1.000000
In [63]:
#### cluster disease
In [153]:
disease_f = ["CANCERDX", "CHDDX","STRKDX","OHRTDX","HIBPDX","EMPHDX","DIABDX",
#cancer/ coronary heart disease/ stroke/ other heart disease/ high blood pressure/ Emphysema肺气肿/Diabetes糖尿病
            "ARTHDX", "ASTHDX"
             , "ADHDADDX","PREGNT31"]
#           arthrits关节炎/asthma/ 多动症/ pregnant / independent living screener
In [154]:
# look for diagnosis features
hl={}
for x in df_health.columns:
    hl[x] = (len(df[str(x)].unique()))
hcat_columns = []
hnum_columns = []
for k,v in hl.items():
    if v<=3 and ("DX" in k):
        hcat_columns.append(k)
    else:
        hnum_columns.append(k)
hcat_columns = list(set(hcat_columns).union(set(disease_f))) 
hcat_columns.remove("BPMLDX")
len(hcat_columns)
Out[154]:
14
In [155]:
#
In [156]:
df_dig = df[hcat_columns]
In [157]:
df_dig_corplot.columns
Out[157]:
Index(['EMPHDX', 'DIABDX', 'CHDDX', 'CANCERDX', 'CHOLDX', 'ANGIDX', 'STRKDX',
       'OHRTDX', 'ARTHDX', 'HIBPDX', 'ASTHDX', 'PREGNT31', 'ADHDADDX', 'MIDX'],
      dtype='object')
In [1]:
'''
df_dig = df_dig.fillna(1.5)
df_dig_corplot = df_dig.fillna(0)
df_dig_corplot.columns=["Pregnant", "Emphysema","Arthritis", 
                "High Cholesterol","Heart Attack ", "Cancer","Diabetes", "High Blood Pressure","Asthma", 
                "Attention Disorder",
               "Coronary Heart Disease", "Other Heart Disease","Stroke", "Angina",'sum_disease', 'cluster',
       'TOTEXP15', 'sum_HBP']
X=df_dig
X.shape
'''
Out[1]:
'\ndf_dig = df_dig.fillna(1.5)\ndf_dig_corplot = df_dig.fillna(0)\ndf_dig_corplot.columns=["Pregnant", "Emphysema","Arthritis", \n                "High Cholesterol","Heart Attack ", "Cancer","Diabetes", "High Blood Pressure","Asthma", \n                "Attention Disorder",\n               "Coronary Heart Disease", "Other Heart Disease","Stroke", "Angina",\'sum_disease\', \'cluster\',\n       \'TOTEXP15\', \'sum_HBP\']\nX=df_dig\nX.shape\n'
In [159]:
kmeans = KMeans(n_clusters=4, random_state=0).fit(df_dig)


'''
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
'''
Out[159]:
'\nbandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)\n\nms = MeanShift(bandwidth=bandwidth, bin_seeding=True)\nms.fit(X)\nlabels = ms.labels_\ncluster_centers = ms.cluster_centers_\n'
In [160]:
df_dig['Total_Disease_Counts'] = (df_dig[hcat_columns] == 1).sum(axis=1)
In [161]:
df_dig["cluster"] = kmeans.labels_
#df_dig["cluster"] = ms.labels_
In [162]:
df_dig.columns=["Pregnant", "Emphysema","Arthritis", 
                "High Cholesterol","Heart Attack ", "Cancer","Diabetes", "High Blood Pressure","Asthma", 
                "Attention Disorder",
               "Coronary Heart Disease", "Other Heart Disease","Stroke", "Angina","sum_disease","cluster"]
In [163]:
df_dig = pd.concat([df_dig, df.TOTEXP15],axis =1)
In [164]:
df_dig.columns
Out[164]:
Index(['Pregnant', 'Emphysema', 'Arthritis', 'High Cholesterol',
       'Heart Attack ', 'Cancer', 'Diabetes', 'High Blood Pressure', 'Asthma',
       'Attention Disorder', 'Coronary Heart Disease', 'Other Heart Disease',
       'Stroke', 'Angina', 'sum_disease', 'cluster', 'TOTEXP15'],
      dtype='object')
In [22]:
df_mean = df_dig.groupby(['cluster']).mean()
print("mean across different clusters: ")
print(df_mean.TOTEXP15)
print("median across different clusters: ")
df_median = df_dig.groupby(['cluster']).median()
print(df_median.TOTEXP15)


df_dig[['sum_disease',"cluster"]].groupby(['cluster']).agg(['mean'])


df_mean = df_dig[['sum_disease',"cluster"]].groupby(['cluster']).mean()
print("mean across different clusters: ")
print(df_mean.sum_disease)
mean across different clusters: 
cluster
0     2605.648589
1     3169.524889
2     5042.968241
3    10700.548126
Name: TOTEXP15, dtype: float64
median across different clusters: 
cluster
0     331.0
1     399.0
2    1302.0
3    4460.5
Name: TOTEXP15, dtype: float64
mean across different clusters: 
cluster
0    0.320519
1    0.718523
2    1.845740
3    3.610597
Name: sum_disease, dtype: float64
In [173]:
df_mean = df_dig[['sum_disease',"cluster"]].groupby(['cluster']).mean()
print("mean of sum_disease across clusters: ")
print(df_mean.sum_disease)


df_mean = df_dig.groupby(['cluster']).mean()
print("mean of TOTEXP across clusters: ")
print(df_mean.TOTEXP15)
mean of sum_disease across clusters: 
cluster
0    0.320519
1    0.718523
2    1.845740
3    3.610597
Name: sum_disease, dtype: float64
mean of TOTEXP across clusters: 
cluster
0     2605.648589
1     3169.524889
2     5042.968241
3    10700.548126
Name: TOTEXP15, dtype: float64
In [79]:
#df_dig.sort_values(by=['cluster'])
In [146]:
corr = df_dig.corr()
In [147]:
corr
Out[147]:
Pregnant Emphysema Arthritis High Cholesterol Heart Attack Cancer Diabetes High Blood Pressure Asthma Attention Disorder Coronary Heart Disease Other Heart Disease Stroke Angina sum_disease cluster TOTEXP15
Pregnant 1.000000 0.508478 0.693918 0.575713 0.249685 0.789386 0.709783 0.551121 0.324405 0.209119 0.087281 0.217239 -0.512087 0.738109 -0.035496 0.179293 0.065139
Emphysema 0.508478 1.000000 0.510865 0.383797 0.391312 0.519280 0.502006 0.409561 0.340357 0.372487 0.056131 0.214450 -0.308465 0.514973 -0.255421 -0.209349 -0.068768
Arthritis 0.693918 0.510865 1.000000 0.514767 0.333757 0.756242 0.653511 0.620832 0.348650 0.303959 0.050013 0.224235 -0.420852 0.801229 -0.079846 -0.090571 -0.031203
High Cholesterol 0.575713 0.383797 0.514767 1.000000 0.272229 0.556803 0.527427 0.446633 0.337932 0.249228 0.038109 0.203919 -0.344395 0.535622 -0.145761 -0.083479 -0.049739
Heart Attack 0.249685 0.391312 0.333757 0.272229 1.000000 0.277459 0.287565 0.289947 0.364878 0.468717 0.047701 0.246454 -0.113088 0.309263 -0.673630 -0.516120 -0.120669
Cancer 0.789386 0.519280 0.756242 0.556803 0.277459 1.000000 0.706179 0.586812 0.323572 0.235502 0.052564 0.215627 -0.499098 0.779357 -0.028169 0.118733 0.043964
Diabetes 0.709783 0.502006 0.653511 0.527427 0.287565 0.706179 1.000000 0.541865 0.333529 0.274849 0.056576 0.210374 -0.444667 0.696300 -0.088618 0.026351 -0.015526
High Blood Pressure 0.551121 0.409561 0.620832 0.446633 0.289947 0.586812 0.541865 1.000000 0.350203 0.280562 0.067355 0.198269 -0.328782 0.577286 -0.139467 -0.179530 -0.063473
Asthma 0.324405 0.340357 0.348650 0.337932 0.364878 0.323572 0.333529 0.350203 1.000000 0.391487 0.113756 0.221633 -0.149539 0.335946 -0.507804 -0.463997 -0.156275
Attention Disorder 0.209119 0.372487 0.303959 0.249228 0.468717 0.235502 0.274849 0.280562 0.391487 1.000000 0.067759 0.244350 -0.081936 0.274723 -0.752245 -0.565450 -0.150810
Coronary Heart Disease 0.087281 0.056131 0.050013 0.038109 0.047701 0.052564 0.056576 0.067355 0.113756 0.067759 1.000000 -0.015750 -0.006540 0.052927 -0.070195 -0.270238 -0.061904
Other Heart Disease 0.217239 0.214450 0.224235 0.203919 0.246454 0.215627 0.210374 0.198269 0.221633 0.244350 -0.015750 1.000000 -0.096639 0.219934 -0.228813 -0.157463 -0.051877
Stroke -0.512087 -0.308465 -0.420852 -0.344395 -0.113088 -0.499098 -0.444667 -0.328782 -0.149539 -0.081936 -0.006540 -0.096639 1.000000 -0.457481 -0.031815 -0.277016 -0.095568
Angina 0.738109 0.514973 0.801229 0.535622 0.309263 0.779357 0.696300 0.577286 0.335946 0.274723 0.052927 0.219934 -0.457481 1.000000 -0.053393 0.002330 0.004763
sum_disease -0.035496 -0.255421 -0.079846 -0.145761 -0.673630 -0.028169 -0.088618 -0.139467 -0.507804 -0.752245 -0.070195 -0.228813 -0.031815 -0.053393 1.000000 0.628748 0.164353
cluster 0.179293 -0.209349 -0.090571 -0.083479 -0.516120 0.118733 0.026351 -0.179530 -0.463997 -0.565450 -0.270238 -0.157463 -0.277016 0.002330 0.628748 1.000000 0.322759
TOTEXP15 0.065139 -0.068768 -0.031203 -0.049739 -0.120669 0.043964 -0.015526 -0.063473 -0.156275 -0.150810 -0.061904 -0.051877 -0.095568 0.004763 0.164353 0.322759 1.000000
In [131]:
corr.Diabetes.sort_values(ascending=False) #高血压 糖尿病 心脏病 ,heart attack,stroke, ANGINA
Out[131]:
Diabetes                  1.000000
Angina                    0.519280
Heart Attack              0.514973
Coronary Heart Disease    0.510865
Emphysema                 0.508478
Stroke                    0.502006
Other Heart Disease       0.409561
High Cholesterol          0.391312
Cancer                    0.383797
High Blood Pressure       0.372487
Arthritis                 0.340357
Pregnant                  0.214450
Asthma                    0.056131
Attention Disorder       -0.308465
Name: Diabetes, dtype: float64
In [85]:
df_dig['sum_HBP'] = (df_dig[["High Blood Pressure","Diabetes","Coronary Heart Disease","Stroke"]] == 1).sum(axis=1)
In [2]:
#corr.CHDDX.sort_values(ascending=False)
In [3]:
#corr.CHDDX.sort_values(ascending=False) #coronary heart disease      #doc  7.13:   prem 700 <->1.2k  
In [4]:
#corr.DIABDX.sort_values(ascending=False)
In [92]:
# diabetes vs chddx vs cholestrol vs heart attack vs othear kinds of haert disease vs angina
In [151]:
corr.TOTEXP15.sort_values(ascending=False) #cancer
Out[151]:
TOTEXP15                  1.000000
cluster                   0.322759
sum_disease               0.164353
Pregnant                  0.065139
Cancer                    0.043964
Angina                    0.004763
Diabetes                 -0.015526
Arthritis                -0.031203
High Cholesterol         -0.049739
Other Heart Disease      -0.051877
Coronary Heart Disease   -0.061904
High Blood Pressure      -0.063473
Emphysema                -0.068768
Stroke                   -0.095568
Heart Attack             -0.120669
Attention Disorder       -0.150810
Asthma                   -0.156275
Name: TOTEXP15, dtype: float64
In [134]:
corr.columns
Out[134]:
Index(['FAMSZEYR', 'Highest Education', 'Office Based Non-Physician Visits',
       'MARRY31X', 'EDUYRDG', 'EDRECODE', 'HIBPAGED', 'CHDAGED', 'ANGIAGED',
       'MIAGED', 'OHRTAGED', 'STRKAGED', 'EMPHAGED', 'CHOLAGED', 'DIABAGED',
       'ARTHAGED', 'Cancer Diagnosis', 'ADHDAGED', 'MOMPRO42', 'DADPRO42',
       'UNHAP42', 'SCHLBH42', 'Age', 'ADUPRO42', 'NERVAF42', 'SIBPRO42',
       'Perceived Health Status', 'SPRPRO42', 'SCHPRO42',
       'Attitude towards Insurance', 'Total Prescribed Medcine', 'CHAPPT42',
       'CHHECR42', 'CHBMIX42', 'BPCHEK53', 'CHOLCK53', 'CHECK53', 'FLUSHT53',
       'PSA53', 'PAPSMR53', 'BRSTEX53', 'MAMOGR53', 'BSTST53', 'CLNTST53',
       'SGMTST53', 'BMINDX53', 'SEATBE53', 'ADPRXY42', 'Family Income Index',
       'ADHECR42', 'PCS42', 'MCS42', 'K6SUM42', 'PHQ242', 'ADCMPM42',
       'DSA1C53', 'TYPEPE42', 'HOUR31', 'POVLEV15', 'VETSP15X',
       'Total_Expenditure', 'Total Out_patients Visits', 'Working Hours',
       'BMI', 'TOTAL Emergency Room Visits', 'Total Office_Based Visits'],
      dtype='object')
In [5]:
#corr.TOTEXP15.sort_values(ascending=False) #total exp
In [95]:
corr.HIBPDX .sort_values(ascending=False)
Out[95]:
HIBPDX         1.000000
CHOLDX         0.468717
ARTHDX         0.391487
DIABDX         0.372487
CHDDX          0.303959
OHRTDX         0.280562
STRKDX         0.274849
MIDX           0.274723
CANCERDX       0.249228
PREGNT31       0.244350
ANGIDX         0.235502
EMPHDX         0.209119
ASTHDX         0.067759
ADHDADDX      -0.081936
TOTEXP15      -0.150810
sum_disease   -0.565450
cluster       -0.752245
Name: HIBPDX, dtype: float64
In [96]:
# 1.people with diabetes are twice likely to get coronary disease and stroke than people without diebetes
# 2.cancer vs lung disesea
# ----------
# is it poss
In [97]:
df_dig.sum_disease.describe()
Out[97]:
count    35427.000000
mean         1.097045
std          1.629472
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max         12.000000
Name: sum_disease, dtype: float64
In [166]:
#df_dig.iloc[:,-:]
corr.columns
Out[166]:
Index(['Pregnant', 'Emphysema', 'Arthritis', 'High Cholesterol',
       'Heart Attack ', 'Cancer', 'Diabetes', 'High Blood Pressure', 'Asthma',
       'Attention Disorder', 'Coronary Heart Disease', 'Other Heart Disease',
       'Stroke', 'Angina', 'sum_disease', 'cluster', 'TOTEXP15'],
      dtype='object')
In [169]:
ax = sns.distplot(df_dig.sum_disease, kde=False)
ax.set(xlabel='Number_Disease_Diagnosed', ylabel='Count')
ax.set(title = "Patients with Multiple Disease")
Out[169]:
[Text(0.5,1,'Patients with Multiple Disease')]
In [6]:
#sns.distplot(df_dig.sum_HBP, kde=False)
In [101]:
df_dig.query('DIABDX == 1').shape[0] #
Out[101]:
2809
In [102]:
df_dig.query('DIABDX == 1 & HIBPDX == 1').shape[0]
Out[102]:
2165
In [103]:
df_dig.query('DIABDX  == 1 & (CHDDX == 1 | OHRTDX==1 |MIDX ==1) & HIBPDX == 1').shape[0]
Out[103]:
777
In [104]:
df_dig.query('STRKDX  == 1 & (CHDDX == 1 | OHRTDX==1 |MIDX ==1) & HIBPDX == 1 & DIABDX == 1').shape[0]
Out[104]:
184
In [105]:
df_dig.query('DIABDX == 1 & CHDDX == 1').shape
Out[105]:
(476, 18)
In [106]:
df_dig.query('CHDDX == 1').shape
Out[106]:
(1290, 18)
In [107]:
df_dig.query('CANCERDX == 1 & EMPHDX == 1').shape
Out[107]:
(144, 18)
In [108]:
df_dig.query('PREGNT31  == 1').shape
Out[108]:
(398, 18)
In [109]:
df_dig.query('HIBPDX == 1 & PREGNT31 == 1').shape
Out[109]:
(33, 18)
In [130]:
df_dig_corplott = df_dig_corplot.drop(['sum_disease', 'cluster',"sum_HBP","TOTEXP15"], axis=1)
corr = df_dig_corplott.corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = 1
# plt corr heatmap
f, ax = plt.subplots(figsize=(8, 6))
cmap = sns.diverging_palette(230, 9, as_cmap=1)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1,vmin=-0.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .6})
Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x2280363bd30>
In [ ]:
# emphdc vs angindx
# diabdx vs hbp
# choldx vs strkdx

for cluster Notice that K-means gives good linear/corr with exp and sum of disease if we use mean_shifting for cluster, the linear relation is not that obvious; but still, corr is preserved

In [132]:
corr
Out[132]:
FAMSZEYR Highest Education Office Based Non-Physician Visits MARRY31X EDUYRDG EDRECODE HIBPAGED CHDAGED ANGIAGED MIAGED ... TYPEPE42 HOUR31 POVLEV15 VETSP15X Total_Expenditure Total Out_patients Visits Working Hours BMI TOTAL Emergency Room Visits Total Office_Based Visits
FAMSZEYR 1.000000 -0.759162 -0.759292 0.639506 -0.304450 -0.499569 -0.301294 -0.633928 -0.635254 -0.606998 ... 0.057571 0.025933 -0.437714 -0.140968 -0.431339 -0.494567 -0.347611 -0.314449 -0.269297 -0.607514
Highest Education -0.759162 1.000000 0.999992 -0.790148 0.134040 0.335298 0.599935 0.881144 0.872202 0.880051 ... -0.088911 -0.035723 0.372919 0.096772 0.297572 0.385605 0.226690 0.229241 0.126176 0.495021
Office Based Non-Physician Visits -0.759292 0.999992 1.000000 -0.790208 0.133057 0.334810 0.600371 0.880824 0.872061 0.879904 ... -0.089031 -0.035741 0.371741 0.097091 0.297992 0.385477 0.226369 0.229535 0.127455 0.495330
MARRY31X 0.639506 -0.790148 -0.790208 1.000000 -0.358971 -0.649085 -0.401184 -0.560091 -0.586797 -0.558782 ... 0.159540 -0.156534 -0.545577 -0.144249 -0.307604 -0.365105 -0.243605 -0.253181 -0.099768 -0.454568
EDUYRDG -0.304450 0.134040 0.133057 -0.358971 1.000000 0.865348 -0.055368 -0.020174 0.024153 -0.000412 ... -0.065903 0.223520 0.586395 0.082637 0.127595 0.194344 0.162767 0.132228 0.022222 0.105291
EDRECODE -0.499569 0.335298 0.334810 -0.649085 0.865348 1.000000 0.096310 0.077318 0.117834 0.079699 ... -0.137600 0.251348 0.616532 0.138725 0.206518 0.272352 0.221532 0.198048 0.091046 0.255315
HIBPAGED -0.301294 0.599935 0.600371 -0.401184 -0.055368 0.096310 1.000000 0.928027 0.924447 0.938599 ... 0.022754 -0.201778 0.193472 0.045798 -0.156771 0.126887 0.064039 0.045173 0.001134 0.019003
CHDAGED -0.633928 0.881144 0.880824 -0.560091 -0.020174 0.077318 0.928027 1.000000 0.975234 0.978291 ... -0.033087 -0.187635 0.209144 -0.023774 0.177515 0.202325 0.069820 0.046074 -0.008183 0.321559
ANGIAGED -0.635254 0.872202 0.872061 -0.586797 0.024153 0.117834 0.924447 0.975234 1.000000 0.955629 ... -0.080433 -0.152701 0.258896 -0.010266 0.178495 0.236807 0.114539 0.057154 -0.019720 0.314693
MIAGED -0.606998 0.880051 0.879904 -0.558782 -0.000412 0.079699 0.938599 0.978291 0.955629 1.000000 ... -0.078187 -0.202589 0.218740 -0.011778 0.162098 0.172839 0.034296 0.044797 -0.050688 0.293008
OHRTAGED -0.663016 0.895592 0.895776 -0.627266 -0.090811 0.103726 0.901247 0.950451 0.922111 0.933987 ... -0.052100 -0.165529 0.196109 0.037892 0.325665 0.313656 0.154182 0.178098 0.128562 0.472983
STRKAGED -0.640351 0.897860 0.897911 -0.612950 -0.025797 0.097007 0.957339 0.957171 0.942455 0.965536 ... -0.071750 -0.158708 0.238798 -0.016105 0.178573 0.174154 0.037694 0.058992 -0.023303 0.320997
EMPHAGED -0.569342 0.877110 0.877161 -0.649082 0.057139 0.156080 0.916310 0.864621 0.844327 0.918648 ... -0.071273 -0.119809 0.311980 0.016062 0.161399 0.138672 -0.009913 0.080667 -0.100102 0.287769
CHOLAGED -0.670461 0.929209 0.929492 -0.607045 -0.058071 0.071774 0.976117 0.942513 0.928474 0.949788 ... -0.065048 -0.183077 0.203850 0.000008 0.263160 0.267178 0.115426 0.128445 0.034964 0.419135
DIABAGED -0.633894 0.898266 0.898447 -0.606268 0.001099 0.108079 0.967015 0.925593 0.915226 0.934955 ... -0.106945 -0.126733 0.274727 -0.012561 0.148314 0.157855 0.026306 0.056395 -0.077556 0.290618
ARTHAGED -0.610173 0.876267 0.876526 -0.636763 -0.048014 0.133816 0.941606 0.905012 0.895890 0.926920 ... -0.091589 -0.151053 0.288410 -0.016361 0.180946 0.184175 0.049957 0.082522 -0.031030 0.301489
Cancer Diagnosis -0.754473 0.913575 0.913943 -0.820688 0.160712 0.409927 0.520052 0.699012 0.688666 0.720665 ... -0.066373 -0.076482 0.370723 0.156369 0.403035 0.460313 0.284558 0.330870 0.200268 0.614810
ADHDAGED -0.417694 0.735148 0.735243 -0.562442 -0.049416 0.181418 0.483428 0.769101 0.703932 0.764037 ... -0.077327 -0.017934 0.220964 0.017440 0.064649 0.129608 0.038421 0.073066 0.053001 0.179580
MOMPRO42 0.006312 -0.196510 -0.196079 0.024387 0.074811 0.097854 -0.475349 -0.351884 -0.359003 -0.357307 ... -0.084626 0.156503 -0.022889 0.017404 0.145121 0.035529 0.031461 0.017651 0.055650 0.123627
DADPRO42 -0.064824 -0.160773 -0.160510 0.014232 0.023466 0.080941 -0.395538 -0.301480 -0.294730 -0.311675 ... -0.069433 0.090898 -0.116360 0.062118 0.111901 0.074187 0.067993 0.041639 0.105555 0.129254
UNHAP42 0.044905 -0.216648 -0.217204 0.079715 0.034852 0.002052 -0.620948 -0.340609 -0.359892 -0.356971 ... 0.045552 0.057346 -0.088281 -0.018832 0.293883 0.026232 0.010343 0.014809 0.030838 0.215721
SCHLBH42 0.087957 -0.336087 -0.336117 0.121088 0.167853 0.034186 -0.623468 -0.488843 -0.473608 -0.490677 ... -0.095963 0.224032 -0.090790 -0.047943 0.213915 -0.063230 -0.060525 -0.007916 -0.017481 0.095843
Age 0.031865 -0.203385 -0.204046 0.070284 0.038199 -0.009473 -0.684541 -0.318852 -0.321470 -0.339188 ... -0.066088 0.137534 -0.078210 -0.036070 0.479938 0.031965 0.000397 0.035810 0.039362 0.307321
ADUPRO42 0.038947 -0.222318 -0.223057 0.085090 0.035497 -0.014663 -0.698285 -0.311371 -0.312337 -0.320696 ... -0.064091 0.061067 -0.090051 -0.034639 0.421996 0.019447 -0.003917 0.023032 0.026450 0.276136
NERVAF42 0.052805 -0.237718 -0.238373 0.105155 0.031262 -0.016566 -0.641881 -0.355810 -0.359457 -0.369864 ... -0.023249 0.047175 -0.089456 -0.021774 0.294934 0.014417 0.001132 0.013578 0.019267 0.208299
SIBPRO42 -0.182548 -0.271525 -0.271375 0.121207 0.093490 0.042455 -0.465703 -0.322674 -0.315229 -0.334138 ... 0.014559 -0.196179 0.072525 0.009454 0.137799 0.071452 0.072295 0.042453 0.091130 0.100909
Perceived Health Status 0.045068 -0.248385 -0.249027 0.106622 0.032219 -0.035894 -0.701554 -0.339696 -0.324575 -0.353597 ... -0.075532 0.107100 -0.104178 -0.047332 0.421213 0.016479 -0.005407 0.012907 0.032286 0.277325
SPRPRO42 0.066791 -0.268427 -0.268309 0.101660 0.102457 0.019850 -0.585490 -0.365333 -0.341369 -0.381557 ... -0.068252 0.050737 -0.088792 -0.023005 0.298975 0.016500 0.005223 0.092095 0.053068 0.170560
SCHPRO42 0.079902 -0.342563 -0.342561 0.123760 0.217187 0.056042 -0.618561 -0.499628 -0.478979 -0.501591 ... -0.077461 0.216371 -0.075989 -0.050278 0.251114 -0.044491 -0.046068 0.077408 0.004158 0.100310
Attitude towards Insurance 0.040049 -0.236677 -0.237119 0.096416 0.019699 -0.032014 -0.665186 -0.334054 -0.320638 -0.352384 ... -0.052890 0.140796 -0.104221 -0.041337 0.477995 0.031273 0.004011 0.034102 0.058508 0.299888
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
CHECK53 0.514753 -0.588034 -0.587860 0.475170 -0.307021 -0.309056 -0.126922 -0.497443 -0.521277 -0.502525 ... -0.046491 -0.009324 -0.364298 -0.168274 -0.537378 -0.556374 -0.399106 -0.362483 -0.320684 -0.649477
FLUSHT53 0.560371 -0.627262 -0.627081 0.517514 -0.236104 -0.290545 -0.159174 -0.568857 -0.552153 -0.578598 ... -0.029447 0.048684 -0.345865 -0.147973 -0.535478 -0.531352 -0.369903 -0.340175 -0.290383 -0.644692
PSA53 0.697021 -0.772564 -0.772651 0.568290 -0.279798 -0.365010 -0.640673 -0.614654 -0.605488 -0.620170 ... -0.017026 0.048875 -0.405536 -0.186098 -0.551637 -0.578344 -0.412191 -0.350788 -0.296174 -0.681763
PAPSMR53 -0.159207 0.306298 0.306591 -0.024595 -0.361719 -0.325128 0.371683 0.476933 0.479616 0.485498 ... -0.003512 -0.261117 -0.158935 -0.071220 -0.119443 -0.094355 -0.114849 -0.086374 -0.101589 -0.024762
BRSTEX53 0.248964 -0.142912 -0.142560 0.298181 -0.428713 -0.414113 0.271894 -0.030024 -0.049966 0.000087 ... 0.021891 -0.217969 -0.355162 -0.096927 -0.433705 -0.363672 -0.285340 -0.284539 -0.193215 -0.414111
MAMOGR53 0.619975 -0.672069 -0.672461 0.602791 -0.287187 -0.405953 -0.503505 -0.416602 -0.457087 -0.386457 ... 0.032889 -0.104740 -0.483341 -0.148387 -0.478490 -0.520555 -0.375085 -0.407683 -0.212976 -0.608163
BSTST53 0.539314 -0.628394 -0.628742 0.406466 -0.085615 -0.180694 -0.526776 -0.493771 -0.492991 -0.473816 ... -0.035039 0.107368 -0.203440 -0.172414 -0.482879 -0.512303 -0.374723 -0.312227 -0.281982 -0.590709
CLNTST53 0.672963 -0.711697 -0.711926 0.537005 -0.274910 -0.366443 -0.554003 -0.507927 -0.496817 -0.504036 ... -0.031240 0.042112 -0.403130 -0.181445 -0.584519 -0.613229 -0.453320 -0.412727 -0.330420 -0.688103
SGMTST53 0.351992 -0.402267 -0.402344 0.249938 -0.098929 -0.137369 -0.320665 -0.326568 -0.299643 -0.321647 ... -0.050779 0.070397 -0.144256 -0.112553 -0.353265 -0.351479 -0.250638 -0.252389 -0.216128 -0.392499
BMINDX53 0.042477 -0.074660 -0.073883 -0.042222 -0.037117 0.086599 0.006678 -0.335599 -0.373043 -0.364803 ... -0.024261 0.113366 -0.170189 0.108881 0.138318 0.132253 0.105141 0.136352 0.209941 0.249530
SEATBE53 0.070361 -0.182616 -0.182339 0.200951 -0.166556 -0.145435 -0.182212 -0.173507 -0.205573 -0.202201 ... 0.008377 0.068875 -0.234389 -0.026396 -0.008046 -0.070752 -0.056897 -0.029304 0.067682 0.023072
ADPRXY42 -0.156850 0.212134 0.212595 -0.087926 -0.206868 -0.158162 0.231241 0.209661 0.197759 0.207039 ... -0.024241 -0.162163 -0.119931 -0.005093 0.124442 0.070077 0.001172 0.035597 0.065954 0.203384
Family Income Index -0.621678 0.642499 0.642843 -0.514331 0.204618 0.303396 0.459172 0.426669 0.433517 0.423560 ... 0.060726 -0.100694 0.286475 0.195800 0.724713 0.764663 0.584938 0.485007 0.479679 0.829104
ADHECR42 -0.197397 0.301388 0.301099 -0.235725 0.198545 0.115805 0.299964 0.371847 0.427506 0.383786 ... -0.079210 -0.004040 0.294411 -0.042875 -0.024784 0.020769 -0.014441 0.004452 -0.211326 0.003731
PCS42 0.573761 -0.632986 -0.633758 0.411145 0.112111 -0.035796 -0.460678 -0.486687 -0.457917 -0.470051 ... -0.075684 0.157362 0.041683 -0.188348 -0.701264 -0.641410 -0.458265 -0.439463 -0.524490 -0.868422
MCS42 0.095582 0.033806 0.033343 -0.086002 0.169241 0.084807 0.156433 0.132884 0.181064 0.158587 ... -0.066096 0.096350 0.268578 -0.117609 -0.366816 -0.327051 -0.270844 -0.209747 -0.465624 -0.451663
K6SUM42 -0.148347 0.003732 0.004372 0.061023 -0.171048 -0.077853 -0.137918 -0.122481 -0.162552 -0.146731 ... 0.068971 -0.116771 -0.271140 0.129918 0.424861 0.383715 0.310968 0.251094 0.503513 0.530868
PHQ242 -0.160654 0.043937 0.044621 0.044724 -0.201595 -0.105288 -0.092145 -0.077354 -0.119895 -0.098590 ... 0.069907 -0.127504 -0.284664 0.124882 0.417766 0.370137 0.291684 0.245659 0.486266 0.532084
ADCMPM42 0.081198 -0.136137 -0.136505 0.049281 0.069447 0.061756 -0.143858 -0.163826 -0.135252 -0.146795 ... 0.006567 0.055116 0.072427 0.003648 -0.085005 -0.055908 -0.031006 -0.085973 -0.044062 -0.097115
DSA1C53 0.120694 -0.183257 -0.183229 0.212996 -0.194567 -0.174686 -0.161898 -0.138804 -0.252745 -0.118962 ... 0.056176 0.002048 -0.205301 -0.102450 -0.100619 -0.155505 -0.137897 -0.075390 -0.017454 -0.131103
TYPEPE42 0.057571 -0.088911 -0.089031 0.159540 -0.065903 -0.137600 0.022754 -0.033087 -0.080433 -0.078187 ... 1.000000 -0.066949 -0.083472 -0.007364 0.010972 0.052463 0.061585 0.002214 0.045429 0.015824
HOUR31 0.025933 -0.035723 -0.035741 -0.156534 0.223520 0.251348 -0.201778 -0.187635 -0.152701 -0.202589 ... -0.066949 1.000000 0.239605 -0.009900 -0.010718 -0.127691 -0.098599 -0.063216 -0.120511 -0.059604
POVLEV15 -0.437714 0.372919 0.371741 -0.545577 0.586395 0.616532 0.193472 0.209144 0.258896 0.218740 ... -0.083472 0.239605 1.000000 0.107905 0.104373 0.232796 0.190076 0.114693 -0.090375 0.112847
VETSP15X -0.140968 0.096772 0.097091 -0.144249 0.082637 0.138725 0.045798 -0.023774 -0.010266 -0.011778 ... -0.007364 -0.009900 0.107905 1.000000 0.134908 0.166413 0.121110 0.135828 0.111166 0.202580
Total_Expenditure -0.431339 0.297572 0.297992 -0.307604 0.127595 0.206518 -0.156771 0.177515 0.178495 0.162098 ... 0.010972 -0.010718 0.104373 0.134908 1.000000 0.676538 0.536499 0.545682 0.558101 0.811630
Total Out_patients Visits -0.494567 0.385605 0.385477 -0.365105 0.194344 0.272352 0.126887 0.202325 0.236807 0.172839 ... 0.052463 -0.127691 0.232796 0.166413 0.676538 1.000000 0.953949 0.453903 0.474622 0.725799
Working Hours -0.347611 0.226690 0.226369 -0.243605 0.162767 0.221532 0.064039 0.069820 0.114539 0.034296 ... 0.061585 -0.098599 0.190076 0.121110 0.536499 0.953949 1.000000 0.351094 0.365314 0.539475
BMI -0.314449 0.229241 0.229535 -0.253181 0.132228 0.198048 0.045173 0.046074 0.057154 0.044797 ... 0.002214 -0.063216 0.114693 0.135828 0.545682 0.453903 0.351094 1.000000 0.354724 0.507594
TOTAL Emergency Room Visits -0.269297 0.126176 0.127455 -0.099768 0.022222 0.091046 0.001134 -0.008183 -0.019720 -0.050688 ... 0.045429 -0.120511 -0.090375 0.111166 0.558101 0.474622 0.365314 0.354724 1.000000 0.564941
Total Office_Based Visits -0.607514 0.495021 0.495330 -0.454568 0.105291 0.255315 0.019003 0.321559 0.314693 0.293008 ... 0.015824 -0.059604 0.112847 0.202580 0.811630 0.725799 0.539475 0.507594 0.564941 1.000000

66 rows × 66 columns

In [ ]: